1 #pragma once
2 // cfloat.hpp: 'classic' float: definition of an arbitrary configuration linear floating-point representation
3 //
4 // Copyright (C) 2017-2021 Stillwater Supercomputing, Inc.
5 //
6 // This file is part of the universal numbers project, which is released under an MIT Open Source license.
7
8 // compiler specific environment has been delegated to be handled by the
9 // number system include file <universal/number/cfloat.hpp>
10 //
11 // supporting types and functions
12 #include <universal/native/ieee754.hpp>
13 #include <universal/native/subnormal.hpp>
14 #include <universal/native/bit_functions.hpp>
15 #include <universal/number/shared/nan_encoding.hpp>
16 #include <universal/number/shared/infinite_encoding.hpp>
17 #include <universal/number/shared/specific_value_encoding.hpp>
18 // cfloat exception structure
19 #include <universal/number/cfloat/exceptions.hpp>
20 // cfloat tracing options
21 #include <universal/number/cfloat/trace_constants.hpp>
22 // composition types used by cfloat
23 #include <universal/internal/blockbinary/blockbinary.hpp>
24 #include <universal/internal/blocktriple/blocktriple.hpp>
25
26 #ifndef CFLOAT_THROW_ARITHMETIC_EXCEPTION
27 #define CFLOAT_THROW_ARITHMETIC_EXCEPTION 0
28 #endif
29 #ifndef TRACE_CONVERSION
30 #define TRACE_CONVERSION 0
31 #endif
32
33 namespace sw::universal {
34
35 /*
36 * classic floats have denorms, but no gradual overflow, and
37 * project values outside of their dynamic range to +-inf
38 *
39 * Behavior flags
40 * gradual underflow: use all fraction encodings when exponent is all 0's
41 * gradual overflow: use all fraction encodings when exponent is all 1's
42 * saturation to maxneg or maxpos when value is out of dynamic range
43 */
44 // Forward definitions
45 template<size_t nbits, size_t es, typename bt,
46 bool hasSubnormals, bool hasSupernormals, bool isSaturating> class cfloat;
47 template<size_t nbits, size_t es, typename bt,
48 bool hasSubnormals, bool hasSupernormals, bool isSaturating>
49 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
50 abs(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>&);
51
52 /// <summary>
53 /// decode an cfloat value into its constituent parts
54 /// </summary>
55 /// <typeparam name="bt"></typeparam>
56 /// <param name="v"></param>
57 /// <param name="s"></param>
58 /// <param name="e"></param>
59 /// <param name="f"></param>
60 template<size_t nbits, size_t es, size_t fbits, typename bt,
61 bool hasSubnormals, bool hasSupernormals, bool isSaturating>
decode(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v,bool & s,blockbinary<es,bt> & e,blockbinary<fbits,bt> & f)62 void decode(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& v, bool& s, blockbinary<es, bt>& e, blockbinary<fbits, bt>& f) {
63 v.sign(s);
64 v.exponent(e);
65 v.fraction(f);
66 }
67
68 /// <summary>
69 /// return the binary scale of the given number
70 /// </summary>
71 /// <typeparam name="bt">Block type used for storage: derived through ADL</typeparam>
72 /// <param name="v">the cfloat number for which we seek to know the binary scale</param>
73 /// <returns>binary scale, i.e. 2^scale, of the value of the cfloat</returns>
74 template<size_t nbits, size_t es, typename bt,
75 bool hasSubnormals, bool hasSupernormals, bool isSaturating>
scale(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)76 int scale(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
77 return v.scale();
78 }
79
80 /// <summary>
81 /// parse a text string into a cfloat value
82 /// </summary>
83 /// <typeparam name="bt"></typeparam>
84 /// <param name="str"></param>
85 /// <returns></returns>
86 template<size_t nbits, size_t es, typename bt,
87 bool hasSubnormals, bool hasSupernormals, bool isSaturating>
88 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
parse(const std::string & stringRep)89 parse(const std::string& stringRep) {
90 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> a{ 0 };
91 return a.assign(stringRep);
92 }
93
94 /// <summary>
95 /// convert a blocktriple to a cfloat. blocktriples come out of the arithmetic
96 /// engine in the form ii.ff...ff and a scale. The conversion must take this
97 /// denormalized form into account during conversion.
98 ///
99 /// The blocktriple must be in this form to round correctly, as all the bits
100 /// after an arithmetic operation must be taken into account.
101 ///
102 /// Transformation:
103 /// ii.ff...ff transform to s.eee.fffff
104 /// All number systems that depend on blocktriple will need to have
105 /// the rounding decision answered, so that functionality can be
106 /// reused if we locate it inside blocktriple.
107 ///
108 /// if (srcbits > fbits) // we need to round
109 /// if (ii.00..00 > 1)
110 /// mask is at srcbits - fbits + 1
111 /// else
112 /// mask is at srcbits - fbits
113 /// }
114 /// else { // no need to round
115 /// }
116 /// </summary>
117 /// <typeparam name="bt">type of the block used for cfloat storage</typeparam>
118 /// <param name="src">the blocktriple to be converted</param>
119 /// <param name="tgt">the resulting cfloat</param>
120 template<size_t srcbits, BlockTripleOperator op, size_t nbits, size_t es, typename bt,
121 bool hasSubnormals, bool hasSupernormals, bool isSaturating>
convert(const blocktriple<srcbits,op,bt> & src,cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & tgt)122 inline /*constexpr*/ void convert(const blocktriple<srcbits, op, bt>& src,
123 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& tgt) {
124 // std::cout << "convert: " << to_binary(src) << std::endl;
125 using btType = blocktriple<srcbits, op, bt>;
126 using cfloatType = cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>;
127 // test special cases
128 if (src.isnan()) {
129 tgt.setnan(src.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
130 }
131 else if (src.isinf()) {
132 tgt.setinf(src.sign());
133 }
134 else if (src.iszero()) {
135 tgt.setzero();
136 tgt.setsign(src.sign()); // preserve sign
137 }
138 else {
139 int significantScale = src.significantscale();
140 int exponent = src.scale() + significantScale;
141 // special case of underflow
142 if constexpr (hasSubnormals) {
143 // std::cout << "exponent = " << exponent << " bias = " << cfloatType::EXP_BIAS << " exp subnormal = " << cfloatType::MIN_EXP_SUBNORMAL << '\n';
144 //if (exponent < (cfloatType::MIN_EXP_SUBNORMAL - 1) || exponent + cfloatType::EXP_BIAS < 0) { // this culls subnormal values too much
145 if (exponent < (cfloatType::MIN_EXP_SUBNORMAL - 1)) {
146
147 tgt.setzero();
148 return;
149 }
150 }
151 else {
152 if (exponent < (cfloatType::MIN_EXP_NORMAL - 1) || exponent + cfloatType::EXP_BIAS < 0) {
153 //if (exponent < (cfloatType::MIN_EXP_NORMAL - 1)) {
154 tgt.setzero();
155 return;
156 }
157 }
158 // special case of overflow
159 if constexpr (hasSupernormals) {
160 if constexpr (isSaturating) {
161 if (exponent > cfloatType::MAX_EXP) {
162 if (src.sign()) tgt.maxneg(); else tgt.maxpos();
163 return;
164 }
165 }
166 else {
167 if (exponent > cfloatType::MAX_EXP) {
168 tgt.setinf(src.sign());
169 return;
170 }
171 }
172 }
173 else { // no supernormals will saturate at a different encoding: TODO can we hide it all in maxpos?
174 if constexpr (isSaturating) {
175 if (exponent > cfloatType::MAX_EXP) {
176 if (src.sign()) tgt.maxneg(); else tgt.maxpos();
177 return;
178 }
179 }
180 else {
181 if (exponent > cfloatType::MAX_EXP) {
182 tgt.setinf(src.sign());
183 return;
184 }
185 }
186 }
187
188 // our value needs to go through rounding to be correctly interpreted
189 //
190 // tgt.clear(); // no need as all bits are going to be set by the code below
191 int adjustment{ 0 };
192 if constexpr (btType::bfbits < 65) {
193 // we can use a uint64_t to construct the cfloat
194 if constexpr (hasSubnormals) {
195 if (exponent >= cfloatType::MIN_EXP_SUBNORMAL && exponent < cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>::MIN_EXP_NORMAL) {
196 // the value is in the subnormal range of the cfloat
197
198 // -exponent because we are right shifting and exponent in this range is negative
199 adjustment = -(exponent + subnormal_reciprocal_shift[es]);
200 // this is the right shift adjustment required for subnormal representation due
201 // to the scale of the input number, i.e. the exponent of 2^-adjustment
202 }
203 else {
204 // the value is in the normal range of the cfloat
205 }
206 }
207
208
209 // process sign
210 uint64_t raw = (src.sign() ? 1ull : 0ull);
211 // std::cout << "raw bits (sign) " << to_binary(raw) << '\n';
212 // get the rounding direction and the LSB right shift:
213 // TODO: do we want to support arbitrary blocktriples instead of the ALU output versions?
214 std::pair<bool, size_t> alignment = src.roundingDecision(adjustment);
215 bool roundup = alignment.first;
216 size_t rightShift = alignment.second; // this is the shift to get the LSB of the src to the LSB of the tgt
217 // std::cout << "round-up? " << (roundup ? "yes" : "no") << '\n';
218 // std::cout << "rightShift " << rightShift << '\n';
219 // process exponent
220
221 uint64_t expBits = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive
222 raw <<= es; // shift sign to make room for the exponent bits
223 raw |= (roundup ? (expBits+1) : expBits);
224 // std::cout << "raw bits (exp) " << to_binary(raw) << '\n';
225 // process fraction bits
226 uint64_t fracbits = src.get_ull(); // get all the bits, including the integer bits
227 // std::cout << "fracbits " << to_binary(fracbits) << '\n';
228 raw <<= cfloatType::fbits;
229 // int rightShift = cfloatType::MIN_EXP_NORMAL - static_cast<int>(exponent) + (srcbits - cfloatType::fbits);
230 // std::cout << "right shift " << rightShift << '\n';
231 fracbits >>= rightShift;
232 // std::cout << "fracbits shifted " << to_binary(fracbits) << '\n';
233 fracbits &= cfloatType::ALL_ONES_FR;
234 // std::cout << "fracbits masked " << to_binary(fracbits) << '\n';
235 raw |= fracbits;
236 tgt.setbits(raw);
237 // std::cout << "raw bits (all) " << to_binary(raw) << '\n';
238 }
239 else {
240 // TODO
241 // compose the segments
242 tgt.setsign(src.sign());
243 tgt.setexponent(src.scale());
244 // this api doesn't work: tgt.setfraction(src.significant());
245 std::cerr << "convert nbits > 64 TBD\n";
246 }
247 }
248 }
249
250
251 /// <summary>
252 /// An arbitrary, fixed-size floating-point number with configurable gradual under/overflow and saturation/non-saturation arithmetic.
253 /// Default configuration offers normal encoding and non-saturating arithmetic.
254 /// /// </summary>
255 /// <typeparam name="nbits">number of bits in the encoding</typeparam>
256 /// <typeparam name="es">number of exponent bits in the encoding</typeparam>
257 /// <typeparam name="bt">the type to use as storage class: one of [uint8_t|uint16_t|uint32_t]</typeparam>
258 /// <typeparam name="hasSubnormals">configure gradual underflow (==subnormals)</typeparam>
259 /// <typeparam name="hasSupernormals">configure graudal overflow (==supernormals)</typeparam>
260 /// <typeparam name="isSaturating">configure saturation arithmetic</typeparam>
261 template<size_t _nbits, size_t _es, typename bt = uint8_t,
262 bool _hasSubnormals = false, bool _hasSupernormals = false, bool _isSaturating = false>
263 class cfloat {
264 public:
265 static_assert(_nbits > _es + 1ull, "nbits is too small to accomodate the requested number of exponent bits");
266 static_assert(_es < 21ull, "my God that is a big number, are you trying to break the Interweb?");
267 static_assert(_es > 0, "number of exponent bits must be bigger than 0 to be a classic floating point number");
268 // how do you assert on the condition that if es == 1 then subnormals and supernormals must be true?
269 static constexpr bool subsuper = (_hasSubnormals && _hasSupernormals);
270 static constexpr bool special = (subsuper ? true : (_es > 1));
271 static_assert(special, "when es == 1, cfloat must have both subnormals and supernormals");
272 static constexpr size_t bitsInByte = 8ull;
273 static constexpr size_t bitsInBlock = sizeof(bt) * bitsInByte;
274 static_assert(bitsInBlock <= 64, "storage unit for block arithmetic needs to be <= uint64_t"); // TODO: carry propagation on uint64_t requires assembly code
275
276 static constexpr size_t nbits = _nbits;
277 static constexpr size_t es = _es;
278 static constexpr size_t fbits = nbits - 1ull - es; // number of fraction bits excluding the hidden bit
279 static constexpr size_t fhbits = nbits - es; // number of fraction bits including the hidden bit
280 static constexpr size_t abits = 2 * fhbits; // size of the addend
281 static constexpr size_t mbits = 2ull * fbits + 1ull; // size of the multiplier output
282 static constexpr size_t divbits = 3ull * fhbits + 4ull;// size of the divider output
283
284 static constexpr size_t storageMask = (0xFFFFFFFFFFFFFFFFull >> (64ull - bitsInBlock));
285 static constexpr bt ALL_ONES = bt(~0); // block type specific all 1's value
286 static constexpr uint32_t ALL_ONES_ES = (0xFFFF'FFFFul >> (32 - es));
287 static constexpr uint64_t topfbits = fbits % 64;
288 static constexpr uint64_t FR_SHIFT = (topfbits > 0 ? (64 - topfbits) : 0);
289 static constexpr uint64_t ALL_ONES_FR = (topfbits > 0 ? (0xFFFF'FFFF'FFFF'FFFFull >> FR_SHIFT) : 0ull); // special case for nbits <= 64
290 static constexpr uint64_t INF_ENCODING = (ALL_ONES_FR & ~1ull);
291
292 static constexpr size_t nrBlocks = 1ull + ((nbits - 1ull) / bitsInBlock);
293 static constexpr size_t MSU = nrBlocks - 1ull; // MSU == Most Significant Unit, as MSB is already taken
294 static constexpr bt MSU_MASK = (ALL_ONES >> (nrBlocks * bitsInBlock - nbits));
295 static constexpr size_t bitsInMSU = bitsInBlock - (nrBlocks * bitsInBlock - nbits);
296 static constexpr size_t fBlocks = 1ull + ((fbits - 1ull) / bitsInBlock); // nr of blocks with fraction bits
297 static constexpr size_t FSU = fBlocks - 1ull; // FSU = Fraction Significant Unit: the index of the block that contains the most significant fraction bits
298 static constexpr bt FSU_MASK = (ALL_ONES >> (fBlocks * bitsInBlock - fbits));
299 static constexpr size_t bitsInFSU = bitsInBlock - (fBlocks * bitsInBlock - fbits);
300
301 static constexpr bt SIGN_BIT_MASK = bt(bt(1ull) << ((nbits - 1ull) % bitsInBlock));
302 static constexpr bt LSB_BIT_MASK = bt(1ull);
303 static constexpr bool MSU_CAPTURES_EXP = (1ull + es) <= bitsInMSU;
304 static constexpr size_t EXP_SHIFT = (MSU_CAPTURES_EXP ? (1 == nrBlocks ? (nbits - 1ull - es) : (bitsInMSU - 1ull - es)) : 0);
305 static constexpr bt MSU_EXP_MASK = ((ALL_ONES << EXP_SHIFT) & ~SIGN_BIT_MASK) & MSU_MASK;
306 static constexpr int EXP_BIAS = ((1l << (es - 1ull)) - 1l);
307 static constexpr int MAX_EXP = (es == 1) ? 1 : ((1l << es) - EXP_BIAS - 1);
308 static constexpr int MIN_EXP_NORMAL = 1 - EXP_BIAS;
309 static constexpr int MIN_EXP_SUBNORMAL = 1 - EXP_BIAS - int(fbits); // the scale of smallest ULP
310 static constexpr bt BLOCK_MASK = bt(~0);
311
312 static constexpr bool hasSubnormals = _hasSubnormals;
313 static constexpr bool hasSupernormals = _hasSupernormals;
314 static constexpr bool isSaturating = _isSaturating;
315 typedef bt BlockType;
316
317 // constructors
cfloat()318 constexpr cfloat() noexcept : _block{ 0 } {};
319
320 constexpr cfloat(const cfloat&) noexcept = default;
321 constexpr cfloat(cfloat&&) noexcept = default;
322
323 constexpr cfloat& operator=(const cfloat&) noexcept = default;
324 constexpr cfloat& operator=(cfloat&&) noexcept = default;
325
326 // decorated/converting constructors
cfloat(const std::string & stringRep)327 constexpr cfloat(const std::string& stringRep) {
328 assign(stringRep);
329 }
330 /// <summary>
331 /// construct an cfloat from another, block type bt must be the same
332 /// </summary>
333 /// <param name="rhs"></param>
334 template<size_t nnbits, size_t ees>
cfloat(const cfloat<nnbits,ees,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)335 cfloat(const cfloat<nnbits, ees, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
336 // this->assign(rhs);
337 }
338
339 // specific value constructor
cfloat(const SpecificValue code)340 constexpr cfloat(const SpecificValue code) noexcept
341 : _block{ 0 } {
342 switch (code) {
343 case SpecificValue::maxpos:
344 maxpos();
345 break;
346 case SpecificValue::minpos:
347 minpos();
348 break;
349 case SpecificValue::zero:
350 default:
351 zero();
352 break;
353 case SpecificValue::minneg:
354 minneg();
355 break;
356 case SpecificValue::maxneg:
357 maxneg();
358 break;
359 case SpecificValue::infpos:
360 setinf(false);
361 break;
362 case SpecificValue::infneg:
363 setinf(true);
364 break;
365 case SpecificValue::nar: // approximation as cfloats don't have a NaR
366 case SpecificValue::qnan:
367 setnan(NAN_TYPE_QUIET);
368 break;
369 case SpecificValue::snan:
370 setnan(NAN_TYPE_SIGNALLING);
371 break;
372 }
373 }
374
375 /// <summary>
376 /// construct an cfloat from a native type, specialized for size
377 /// </summary>
378 /// <param name="iv">initial value to construct</param>
cfloat(signed char iv)379 constexpr cfloat(signed char iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(short iv)380 constexpr cfloat(short iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(int iv)381 constexpr cfloat(int iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(long iv)382 constexpr cfloat(long iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(long long iv)383 constexpr cfloat(long long iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(char iv)384 constexpr cfloat(char iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned short iv)385 constexpr cfloat(unsigned short iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned int iv)386 constexpr cfloat(unsigned int iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned long iv)387 constexpr cfloat(unsigned long iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned long long iv)388 constexpr cfloat(unsigned long long iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(float iv)389 CONSTEXPRESSION cfloat(float iv) noexcept : _block{ 0 } { *this = iv; }
cfloat(double iv)390 CONSTEXPRESSION cfloat(double iv) noexcept : _block{ 0 } { *this = iv; }
391
392 // assignment operators
operator =(signed char rhs)393 constexpr cfloat& operator=(signed char rhs) noexcept { return convert_signed_integer(rhs); }
operator =(short rhs)394 constexpr cfloat& operator=(short rhs) noexcept { return convert_signed_integer(rhs); }
operator =(int rhs)395 constexpr cfloat& operator=(int rhs) noexcept { return convert_signed_integer(rhs); }
operator =(long rhs)396 constexpr cfloat& operator=(long rhs) noexcept { return convert_signed_integer(rhs); }
operator =(long long rhs)397 constexpr cfloat& operator=(long long rhs) noexcept { return convert_signed_integer(rhs); }
398
operator =(char rhs)399 constexpr cfloat& operator=(char rhs) noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned short rhs)400 constexpr cfloat& operator=(unsigned short rhs) noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned int rhs)401 constexpr cfloat& operator=(unsigned int rhs) noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned long rhs)402 constexpr cfloat& operator=(unsigned long rhs) noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned long long rhs)403 constexpr cfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned_integer(rhs); }
404
operator =(float rhs)405 CONSTEXPRESSION cfloat& operator=(float rhs) noexcept { return convert_ieee754(rhs); }
operator =(double rhs)406 CONSTEXPRESSION cfloat& operator=(double rhs) noexcept { return convert_ieee754(rhs); }
407
408 // guard long double support to enable ARM and RISC-V embedded environments
409 #if LONG_DOUBLE_SUPPORT
cfloat(long double iv)410 CONSTEXPRESSION cfloat(long double iv) noexcept : _block{ 0 } { *this = iv; }
operator =(long double rhs)411 CONSTEXPRESSION cfloat& operator=(long double rhs) noexcept { return convert_ieee754(rhs); }
operator long double() const412 explicit operator long double() const { return to_native<long double>(); }
413 #endif
414
415 // arithmetic operators
416 // prefix operator
operator -() const417 inline cfloat operator-() const {
418 cfloat tmp(*this);
419 tmp._block[MSU] ^= SIGN_BIT_MASK;
420 return tmp;
421 }
422
operator +=(const cfloat & rhs)423 cfloat& operator+=(const cfloat& rhs) {
424 if constexpr (cfloat_trace_add) std::cout << "---------------------- ADD -------------------" << std::endl;
425 // special case handling of the inputs
426 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
427 if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
428 throw cfloat_operand_is_nan{};
429 }
430 #else
431 if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
432 setnan(NAN_TYPE_SIGNALLING);
433 return *this;
434 }
435 if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
436 setnan(NAN_TYPE_QUIET);
437 return *this;
438 }
439 #endif
440 // normal + inf = inf
441 // normal + -inf = -inf
442 // inf + normal = inf
443 // inf + inf = inf
444 // inf + -inf = ?
445 // -inf + normal = -inf
446 // -inf + -inf = -inf
447 // -inf + inf = ?
448 if (isinf()) {
449 if (rhs.isinf()) {
450 if (sign() != rhs.sign()) {
451 setnan(NAN_TYPE_SIGNALLING);
452 }
453 return *this;
454 }
455 else {
456 return *this;
457 }
458 }
459 else {
460 if (rhs.isinf()) {
461 *this = rhs;
462 return *this;
463 }
464 }
465
466 if (iszero()) {
467 *this = rhs;
468 return *this;
469 }
470 if (rhs.iszero()) return *this;
471
472 // arithmetic operation
473 blocktriple<fbits, BlockTripleOperator::ADD, bt> a, b, sum;
474
475 // transform the inputs into (sign,scale,significant)
476 // triples of the correct width
477 normalizeAddition(a);
478 rhs.normalizeAddition(b);
479 sum.add(a, b);
480
481 convert(sum, *this);
482
483 return *this;
484 }
operator +=(double rhs)485 cfloat& operator+=(double rhs) {
486 return *this += cfloat(rhs);
487 }
operator -=(const cfloat & rhs)488 cfloat& operator-=(const cfloat& rhs) {
489 if constexpr (cfloat_trace_sub) std::cout << "---------------------- SUB -------------------" << std::endl;
490 if (rhs.isnan())
491 return *this += rhs;
492 else
493 return *this += -rhs;
494 }
operator -=(double rhs)495 cfloat& operator-=(double rhs) {
496 return *this -= cfloat(rhs);
497 }
operator *=(const cfloat & rhs)498 cfloat& operator*=(const cfloat& rhs) {
499 if constexpr (cfloat_trace_mul) std::cout << "---------------------- MUL -------------------" << std::endl;
500 // special case handling of the inputs
501 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
502 if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
503 throw cfloat_operand_is_nan{};
504 }
505 #else
506 if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
507 setnan(NAN_TYPE_SIGNALLING);
508 return *this;
509 }
510 if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
511 setnan(NAN_TYPE_QUIET);
512 return *this;
513 }
514 #endif
515 // inf * inf = inf
516 // inf * -inf = -inf
517 // -inf * inf = -inf
518 // -inf * -inf = inf
519 // 0 * inf = -nan(ind)
520 bool resultSign = sign() != rhs.sign();
521 if (isinf()) {
522 if (rhs.isinf()) {
523 setsign(resultSign);
524 return *this;
525 }
526 else {
527 setnan(NAN_TYPE_SIGNALLING);
528 return *this;
529 }
530 }
531 else {
532 if (rhs.isinf()) {
533 setnan(NAN_TYPE_SIGNALLING);
534 return *this;
535 }
536 }
537
538 if (iszero() || rhs.iszero()) {
539 setzero();
540 setsign(resultSign); // deal with negative 0
541 return *this;
542 }
543
544 // arithmetic operation
545 blocktriple<fbits, BlockTripleOperator::MUL, bt> a, b, product;
546
547 // transform the inputs into (sign,scale,significant)
548 // triples of the correct width
549 normalizeMultiplication(a);
550 rhs.normalizeMultiplication(b);
551 product.mul(a, b);
552
553 convert(product, *this);
554
555 if constexpr (cfloat_trace_mul) {
556 std::cout << to_binary(a) << " * " << to_binary(b) << " = " << to_binary(product) << '\n';
557 }
558 return *this;
559 }
operator *=(double rhs)560 cfloat& operator*=(double rhs) {
561 return *this *= cfloat(rhs);
562 }
operator /=(const cfloat & rhs)563 cfloat& operator/=(const cfloat& rhs) {
564 if constexpr (cfloat_trace_div) std::cout << "---------------------- DIV -------------------" << std::endl;
565 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
566 if (rhs.iszero()) throw cfloat_divide_by_zero();
567 if (rhs.isnan()) throw cfloat_divide_by_nan();
568 if (isnan()) throw cfloat_operand_is_nan();
569 #else
570 if (rhs.iszero()) std::cerr << "cfloat_negative_sqrt_argument" << std::endl;
571 if (rhs.isnan()) std::cerr << "cfloat_divide_by_nan" << std::endl;
572 if (isnan()) std::cerr << "cfloat_operand_is_nan" << std::endl;
573 #endif
574 return *this;
575 }
operator /=(double rhs)576 cfloat& operator/=(double rhs) {
577 return *this /= cfloat(rhs);
578 }
579 /// <summary>
580 /// move to the next bit encoding modulo 2^nbits
581 /// </summary>
582 /// <typeparam name="bt"></typeparam>
operator ++()583 inline cfloat& operator++() {
584 if constexpr (0 == nrBlocks) {
585 return *this;
586 }
587 else if constexpr (1 == nrBlocks) {
588 if (ispos()) {
589 if ((_block[MSU] & (MSU_MASK >> 1)) == (MSU_MASK >> 1)) { // pattern: 0.11.111 = nan
590 _block[MSU] |= SIGN_BIT_MASK; // pattern: 1.11.111 = snan
591 }
592 else {
593 ++_block[MSU];
594 }
595 }
596 else {
597 if ((_block[MSU] & SIGN_BIT_MASK) == _block[MSU]) { // pattern: 1.00.000 = -0
598 _block[MSU] = 0; // pattern: 0.00.000 = +0
599 }
600 else {
601 --_block[MSU];
602 }
603 }
604 }
605 else {
606 if (ispos()) {
607 // special case: pattern: 0.11.111 = nan transitions to pattern: 1.11.111 = snan
608 if (isnanencoding()) {
609 setnan(NAN_TYPE_SIGNALLING);
610 }
611 else {
612 bool carry = true;
613 for (unsigned i = 0; i < MSU; ++i) {
614 if (carry) {
615 if ((_block[i] & storageMask) == storageMask) { // block will overflow
616 _block[i] = 0;
617 carry = true;
618 }
619 else {
620 ++_block[i];
621 carry = false;
622 }
623 }
624 }
625 if (carry) {
626 ++_block[MSU];
627 }
628 }
629 }
630 else {
631 // special case: pattern: 1.00.000 = -0 transitions to pattern: 0.00.000 = +0
632 if (iszeroencoding()) {
633 setzero();
634 }
635 else {
636 // 1111 0000
637 // 1110 1111
638 bool borrow = true;
639 for (unsigned i = 0; i < MSU; ++i) {
640 if (borrow) {
641 if ((_block[i] & storageMask) == 0) { // block will underflow
642 --_block[i];
643 borrow = true;
644 }
645 else {
646 --_block[i];
647 borrow = false;
648 }
649 }
650 }
651 if (borrow) {
652 --_block[MSU];
653 }
654 }
655 }
656 }
657 return *this;
658 }
operator ++(int)659 inline cfloat operator++(int) {
660 cfloat tmp(*this);
661 operator++();
662 return tmp;
663 }
operator --()664 inline cfloat& operator--() {
665 if constexpr (0 == nrBlocks) {
666 return *this;
667 }
668 else if constexpr (1 == nrBlocks) {
669 if (ispos()) {
670 if (_block[MSU] == 0) { // pattern: 0.00.000 = 0
671 _block[MSU] |= SIGN_BIT_MASK; // pattern: 1.00.000 = -0
672 }
673 else {
674 --_block[MSU];
675 }
676 }
677 else {
678 if ((_block[MSU] & MSU_MASK) == MSU_MASK) { // pattern: 1.11.111 = snan
679 _block[MSU] &= ~SIGN_BIT_MASK; // pattern: 0.11.111 = qnan
680 }
681 else {
682 ++_block[MSU];
683 }
684 }
685
686 }
687 else {
688 if (ispos()) {
689 // special case: pattern: 0.00.000 = +0 transitions to pattern: 1.00.000 = -0
690 if (iszeroencoding()) {
691 setsign(true);
692 }
693 else {
694 bool borrow = true;
695 for (unsigned i = 0; i < MSU; ++i) {
696 if (borrow) {
697 if ((_block[i] & storageMask) == 0) { // block will underflow
698 --_block[i];
699 borrow = true;
700 }
701 else {
702 --_block[i];
703 borrow = false;
704 }
705 }
706 }
707 if (borrow) {
708 --_block[MSU];
709 }
710 }
711 }
712 else {
713 // special case: pattern: 1.11.111 = snan transitions to pattern: 0.11.111 = qnan
714 if (isnanencoding()) {
715 setsign(false);
716 }
717 else {
718 bool carry = true;
719 for (unsigned i = 0; i < MSU; ++i) {
720 if (carry) {
721 if ((_block[i] & storageMask) == storageMask) { // block will overflow
722 _block[i] = 0;
723 carry = true;
724 }
725 else {
726 ++_block[i];
727 carry = false;
728 }
729 }
730 }
731 if (carry) {
732 ++_block[MSU];
733 }
734 }
735 }
736 }
737 return *this;
738 }
operator --(int)739 inline cfloat operator--(int) {
740 cfloat tmp(*this);
741 operator--();
742 return tmp;
743 }
744
745 // modifiers
746
747 /// <summary>
748 /// clear the content of this cfloat to zero
749 /// </summary>
750 /// <returns>void</returns>
clear()751 inline constexpr void clear() noexcept {
752 if constexpr (0 == nrBlocks) {
753 return;
754 }
755 else if constexpr (1 == nrBlocks) {
756 _block[0] = bt(0);
757 }
758 else if constexpr (2 == nrBlocks) {
759 _block[0] = bt(0);
760 _block[1] = bt(0);
761 }
762 else if constexpr (3 == nrBlocks) {
763 _block[0] = bt(0);
764 _block[1] = bt(0);
765 _block[2] = bt(0);
766 }
767 else if constexpr (4 == nrBlocks) {
768 _block[0] = bt(0);
769 _block[1] = bt(0);
770 _block[2] = bt(0);
771 _block[3] = bt(0);
772 }
773 else if constexpr (5 == nrBlocks) {
774 _block[0] = bt(0);
775 _block[1] = bt(0);
776 _block[2] = bt(0);
777 _block[3] = bt(0);
778 _block[4] = bt(0);
779 }
780 else if constexpr (6 == nrBlocks) {
781 _block[0] = bt(0);
782 _block[1] = bt(0);
783 _block[2] = bt(0);
784 _block[3] = bt(0);
785 _block[4] = bt(0);
786 _block[5] = bt(0);
787 }
788 else if constexpr (7 == nrBlocks) {
789 _block[0] = bt(0);
790 _block[1] = bt(0);
791 _block[2] = bt(0);
792 _block[3] = bt(0);
793 _block[4] = bt(0);
794 _block[5] = bt(0);
795 _block[6] = bt(0);
796 }
797 else if constexpr (8 == nrBlocks) {
798 _block[0] = bt(0);
799 _block[1] = bt(0);
800 _block[2] = bt(0);
801 _block[3] = bt(0);
802 _block[4] = bt(0);
803 _block[5] = bt(0);
804 _block[6] = bt(0);
805 _block[7] = bt(0);
806 }
807 else {
808 for (size_t i = 0; i < nrBlocks; ++i) {
809 _block[i] = bt(0);
810 }
811 }
812 }
813 /// <summary>
814 /// set the number to +0
815 /// </summary>
816 /// <returns>void</returns>
setzero()817 inline constexpr void setzero() noexcept { clear(); }
818 /// <summary>
819 /// set the number to +inf
820 /// </summary>
821 /// <param name="sign">boolean to make it + or - infinity, default is -inf</param>
822 /// <returns>void</returns>
setinf(bool sign=true)823 inline constexpr void setinf(bool sign = true) noexcept {
824 if constexpr (0 == nrBlocks) {
825 return;
826 }
827 else if constexpr (1 == nrBlocks) {
828 _block[MSU] = sign ? bt(MSU_MASK ^ LSB_BIT_MASK) : bt(~SIGN_BIT_MASK & (MSU_MASK ^ LSB_BIT_MASK));
829 }
830 else if constexpr (2 == nrBlocks) {
831 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
832 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
833 }
834 else if constexpr (3 == nrBlocks) {
835 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
836 _block[1] = BLOCK_MASK;
837 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
838 }
839 else if constexpr (4 == nrBlocks) {
840 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
841 _block[1] = BLOCK_MASK;
842 _block[2] = BLOCK_MASK;
843 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
844 }
845 else if constexpr (5 == nrBlocks) {
846 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
847 _block[1] = BLOCK_MASK;
848 _block[2] = BLOCK_MASK;
849 _block[3] = BLOCK_MASK;
850 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
851 }
852 else if constexpr (6 == nrBlocks) {
853 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
854 _block[1] = BLOCK_MASK;
855 _block[2] = BLOCK_MASK;
856 _block[3] = BLOCK_MASK;
857 _block[4] = BLOCK_MASK;
858 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
859 }
860 else if constexpr (7 == nrBlocks) {
861 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
862 _block[1] = BLOCK_MASK;
863 _block[2] = BLOCK_MASK;
864 _block[3] = BLOCK_MASK;
865 _block[4] = BLOCK_MASK;
866 _block[5] = BLOCK_MASK;
867 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
868 }
869 else if constexpr (8 == nrBlocks) {
870 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
871 _block[1] = BLOCK_MASK;
872 _block[2] = BLOCK_MASK;
873 _block[3] = BLOCK_MASK;
874 _block[4] = BLOCK_MASK;
875 _block[5] = BLOCK_MASK;
876 _block[6] = BLOCK_MASK;
877 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
878 }
879 else {
880 _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
881 for (size_t i = 1; i < nrBlocks - 1; ++i) {
882 _block[i] = BLOCK_MASK;
883 }
884 _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
885 }
886 }
887 /// <summary>
888 /// set the number to a quiet NaN (+nan) or a signalling NaN (-nan, default)
889 /// </summary>
890 /// <param name="sign">boolean to make it + or - infinity, default is -inf</param>
891 /// <returns>void</returns>
setnan(int NaNType=NAN_TYPE_SIGNALLING)892 inline constexpr void setnan(int NaNType = NAN_TYPE_SIGNALLING) noexcept {
893 if constexpr (0 == nrBlocks) {
894 return;
895 }
896 else if constexpr (1 == nrBlocks) {
897 // fall through
898 }
899 else if constexpr (2 == nrBlocks) {
900 _block[0] = BLOCK_MASK;
901 }
902 else if constexpr (3 == nrBlocks) {
903 _block[0] = BLOCK_MASK;
904 _block[1] = BLOCK_MASK;
905 }
906 else if constexpr (4 == nrBlocks) {
907 _block[0] = BLOCK_MASK;
908 _block[1] = BLOCK_MASK;
909 _block[2] = BLOCK_MASK;
910 }
911 else if constexpr (5 == nrBlocks) {
912 _block[0] = BLOCK_MASK;
913 _block[1] = BLOCK_MASK;
914 _block[2] = BLOCK_MASK;
915 _block[3] = BLOCK_MASK;
916 }
917 else if constexpr (6 == nrBlocks) {
918 _block[0] = BLOCK_MASK;
919 _block[1] = BLOCK_MASK;
920 _block[2] = BLOCK_MASK;
921 _block[3] = BLOCK_MASK;
922 _block[4] = BLOCK_MASK;
923 }
924 else if constexpr (7 == nrBlocks) {
925 _block[0] = BLOCK_MASK;
926 _block[1] = BLOCK_MASK;
927 _block[2] = BLOCK_MASK;
928 _block[3] = BLOCK_MASK;
929 _block[4] = BLOCK_MASK;
930 _block[5] = BLOCK_MASK;
931 }
932 else if constexpr (8 == nrBlocks) {
933 _block[0] = BLOCK_MASK;
934 _block[1] = BLOCK_MASK;
935 _block[2] = BLOCK_MASK;
936 _block[3] = BLOCK_MASK;
937 _block[4] = BLOCK_MASK;
938 _block[5] = BLOCK_MASK;
939 _block[6] = BLOCK_MASK;
940 }
941 else {
942 for (size_t i = 0; i < nrBlocks - 1; ++i) {
943 _block[i] = BLOCK_MASK;
944 }
945 }
946 _block[MSU] = NaNType == NAN_TYPE_SIGNALLING ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
947 }
setsign(bool sign=true)948 inline constexpr void setsign(bool sign = true) {
949 if (sign) {
950 _block[MSU] |= SIGN_BIT_MASK;
951 }
952 else {
953 _block[MSU] &= ~SIGN_BIT_MASK;
954 }
955 }
setexponent(int scale)956 inline constexpr bool setexponent(int scale) {
957 if (scale < MIN_EXP_SUBNORMAL || scale > MAX_EXP) return false; // this scale cannot be represented
958 if constexpr (nbits < 65) {
959 // we can use a uint64_t to construct the cfloat
960 //uint64_t raw{ 0 };
961 if (scale >= MIN_EXP_SUBNORMAL && scale < MIN_EXP_NORMAL) {
962 // we are a subnormal number: all exponent bits are 1
963 // what do you do know? If you set them all to 1, you still
964 // don't have the right scale
965 return false;
966 }
967 else {
968 // TODO: optimize
969 uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
970 uint32_t mask = (1ul << (es - 1));
971 for (size_t i = nbits - 2; i > nbits - 2 - es; --i) {
972 setbit(i, (mask & exponentBits));
973 mask >>= 1;
974 }
975 }
976 }
977 else {
978 // TODO: optimize
979 uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
980 uint32_t mask = (1ul << (es - 1));
981 for (size_t i = nbits - 2; i > nbits - 2 - es; --i) {
982 setbit(i, (mask & exponentBits));
983 mask >>= 1;
984 }
985 }
986 return true;
987 }
988 /// <summary>
989 /// set the fraction bits given a significant in the form ???
990 /// </summary>
991 /// <param name="significant"></param>
setfraction(const blockbinary<fbits,bt> & fraction)992 inline constexpr void setfraction(const blockbinary<fbits, bt>& fraction) {
993 for (size_t i = 0; i < fbits; ++i) {
994 setbit(i, fraction.test(i));
995 }
996 }
setfraction(uint64_t raw_bits)997 inline constexpr void setfraction(uint64_t raw_bits) {
998 // unoptimized as it is not meant to be an end-user API, it is a test API
999 if constexpr (fbits < 65) {
1000 uint64_t mask{ 1ull };
1001 for (size_t i = 0; i < fbits; ++i) {
1002 setbit(i, (mask & raw_bits));
1003 mask <<= 1;
1004 }
1005 }
1006 }
1007 // specific number system values of interest
maxpos()1008 inline constexpr cfloat& maxpos() noexcept {
1009 if constexpr (hasSupernormals) {
1010 // maximum positive value has this bit pattern: 0-1...1-111...101, that is, sign = 0, e = 11..11, f = 111...101
1011 clear();
1012 flip();
1013 setbit(nbits - 1ull, false); // sign = 0
1014 setbit(1ull, false); // bit1 = 0
1015 }
1016 else {
1017 // maximum positive value has this bit pattern: 0-1...0-111...111, that is, sign = 0, e = 11..10, f = 111...111
1018 clear();
1019 flip();
1020 setbit(fbits, false); // set least significant exponent bit to 0
1021 setbit(nbits - 1ull, false); // set sign to 0
1022 }
1023 return *this;
1024 }
minpos()1025 inline constexpr cfloat& minpos() noexcept {
1026 if constexpr (hasSubnormals) {
1027 // minimum positive value has this bit pattern: 0-000-00...01, that is, sign = 0, e = 000, f = 00001
1028 clear();
1029 setbit(0);
1030 }
1031 else {
1032 // minimum positive value has this bit pattern: 0-001-00...0, that is, sign = 0, e = 001, f = 0000
1033 clear();
1034 setbit(fbits);
1035 }
1036 return *this;
1037 }
zero()1038 inline constexpr cfloat& zero() noexcept {
1039 // the zero value
1040 clear();
1041 return *this;
1042 }
minneg()1043 inline constexpr cfloat& minneg() noexcept {
1044 if constexpr (hasSubnormals) {
1045 // minimum negative value has this bit pattern: 1-000-00...01, that is, sign = 1, e = 00, f = 00001
1046 clear();
1047 setbit(nbits - 1ull);
1048 setbit(0);
1049 }
1050 else {
1051 // minimum negative value has this bit pattern: 1-001-00...0, that is, sign = 1, e = 001, f = 0000
1052 clear();
1053 setbit(fbits);
1054 setbit(nbits - 1ull);
1055 }
1056 return *this;
1057 }
maxneg()1058 inline constexpr cfloat& maxneg() noexcept {
1059 if constexpr (hasSupernormals) {
1060 // maximum negative value has this bit pattern: 1-1...1-111...101, that is, sign = 1, e = 1..1, f = 111...101
1061 clear();
1062 flip();
1063 setbit(1ull, false);
1064 }
1065 else {
1066 // maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1067 clear();
1068 flip();
1069 setbit(fbits, false);
1070 }
1071 return *this;
1072 }
1073
1074 /// <summary>
1075 /// set a specific bit in the encoding to true or false. If bit index is out of bounds, no modification takes place.
1076 /// </summary>
1077 /// <param name="i">bit index to set</param>
1078 /// <param name="v">boolean value to set the bit to. Default is true.</param>
1079 /// <returns>void</returns>
setbit(size_t i,bool v=true)1080 inline constexpr void setbit(size_t i, bool v = true) noexcept {
1081 if (i < nbits) {
1082 bt block = _block[i / bitsInBlock];
1083 bt null = ~(1ull << (i % bitsInBlock));
1084 bt bit = bt(v ? 1 : 0);
1085 bt mask = bt(bit << (i % bitsInBlock));
1086 _block[i / bitsInBlock] = bt((block & null) | mask);
1087 return;
1088 }
1089 }
1090 /// <summary>
1091 /// set the raw bits of the cfloat. This is a required API function for number systems in the Universal Numbers Library
1092 /// This enables verification test suites to inject specific test bit patterns using a common interface.
1093 // This is a memcpy type operator, but the target number system may not have a linear memory layout and
1094 // thus needs to steer the bits in potentially more complicated ways then memcpy.
1095 /// </summary>
1096 /// <param name="raw_bits">unsigned long long carrying bits that will be written verbatim to the cfloat</param>
1097 /// <returns>reference to the cfloat</returns>
setbits(uint64_t raw_bits)1098 inline constexpr cfloat& setbits(uint64_t raw_bits) noexcept {
1099 if constexpr (0 == nrBlocks) {
1100 return *this;
1101 }
1102 else if constexpr (1 == nrBlocks) {
1103 _block[0] = raw_bits & storageMask;
1104 }
1105 else if constexpr (2 == nrBlocks) {
1106 if constexpr (bitsInBlock < 64) {
1107 _block[0] = raw_bits & storageMask;
1108 raw_bits >>= bitsInBlock;
1109 _block[1] = raw_bits & storageMask;
1110 }
1111 else {
1112 _block[0] = raw_bits & storageMask;
1113 _block[1] = 0;
1114 }
1115 }
1116 else if constexpr (3 == nrBlocks) {
1117 if constexpr (bitsInBlock < 64) {
1118 _block[0] = raw_bits & storageMask;
1119 raw_bits >>= bitsInBlock;
1120 _block[1] = raw_bits & storageMask;
1121 raw_bits >>= bitsInBlock;
1122 _block[2] = raw_bits & storageMask;
1123 }
1124 else {
1125 _block[0] = raw_bits & storageMask;
1126 _block[1] = 0;
1127 _block[2] = 0;
1128 }
1129 }
1130 else if constexpr (4 == nrBlocks) {
1131 if constexpr (bitsInBlock < 64) {
1132 _block[0] = raw_bits & storageMask;
1133 raw_bits >>= bitsInBlock;
1134 _block[1] = raw_bits & storageMask;
1135 raw_bits >>= bitsInBlock;
1136 _block[2] = raw_bits & storageMask;
1137 raw_bits >>= bitsInBlock;
1138 _block[3] = raw_bits & storageMask;
1139 }
1140 else {
1141 _block[0] = raw_bits & storageMask;
1142 _block[1] = 0;
1143 _block[2] = 0;
1144 _block[3] = 0;
1145 }
1146 }
1147 else {
1148 if constexpr (bitsInBlock < 64) {
1149 for (size_t i = 0; i < nrBlocks; ++i) {
1150 _block[i] = raw_bits & storageMask;
1151 raw_bits >>= bitsInBlock;
1152 }
1153 }
1154 else {
1155 _block[0] = raw_bits & storageMask;
1156 for (size_t i = 1; i < nrBlocks; ++i) {
1157 _block[i] = 0;
1158 }
1159 }
1160 }
1161 _block[MSU] &= MSU_MASK; // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
1162 return *this;
1163 }
1164
1165 /// <summary>
1166 /// 1's complement of the encoding
1167 /// </summary>
1168 /// <returns>reference to this cfloat object</returns>
flip()1169 inline constexpr cfloat& flip() noexcept { // in-place one's complement
1170 for (size_t i = 0; i < nrBlocks; ++i) {
1171 _block[i] = bt(~_block[i]);
1172 }
1173 _block[MSU] &= MSU_MASK; // assert precondition of properly nulled leading non-bits
1174 return *this;
1175 }
1176
1177 /// <summary>
1178 /// assign the value of the string representation to the cfloat
1179 /// </summary>
1180 /// <param name="stringRep">decimal scientific notation of a real number to be assigned</param>
1181 /// <returns>reference to this cfloat</returns>
assign(const std::string & str)1182 inline constexpr cfloat& assign(const std::string& str) noexcept {
1183 clear();
1184 if (str.length() == 0) return *this;
1185 // TODO: regex based determination of scientific form or binary form
1186
1187 if (str.length() > 2) {
1188 if (str[0] == '0' && str[1] == 'b') {
1189 // binary string needs to be at least nbits+4 characters
1190 int field(0);
1191 int exponentBits(-1); // we start the field with a '.'
1192 if (str.size() != (nbits + 4)) {
1193 std::cerr << "provided binary string representation does not contain " << nbits << " bits. Reset to 0\n";
1194 return *this;
1195 }
1196 size_t index = nbits;
1197 for (size_t i = 1; i < str.size(); ++i) {
1198 if (str[i] == '1') {
1199 setbit(--index, true);
1200 }
1201 else if (str[i] == '0') {
1202 setbit(--index, false);
1203 }
1204 else if (str[i] == '.' || str[i] == '\'') {
1205 ++field;
1206 if (field == 2) { // just finished parsing exponent field: we can now check the number of exponent bits
1207 if (exponentBits != es) {
1208 std::cerr << "provided binary string representation does not contain " << es << " exponent bits. Found " << exponentBits << ". Reset to 0\n";
1209 return *this;
1210 }
1211 }
1212 }
1213 if (field == 1) { // exponent field
1214 ++exponentBits;
1215 }
1216 }
1217 if (field != 2) {
1218 std::cerr << "provided binary string did not contain three fields separated by '.': Reset to 0\n";
1219 return *this;
1220 }
1221 }
1222 }
1223 else {
1224 std::cerr << "parse/assign currently only parse binary string formats that start with 0b\n";
1225 }
1226 return *this;
1227 }
1228
1229 // selectors
sign() const1230 inline constexpr bool sign() const noexcept { return (_block[MSU] & SIGN_BIT_MASK) == SIGN_BIT_MASK; }
scale() const1231 inline constexpr int scale() const noexcept {
1232 int e{ 0 };
1233 if constexpr (MSU_CAPTURES_EXP) {
1234 e = static_cast<int>((_block[MSU] & ~SIGN_BIT_MASK) >> EXP_SHIFT);
1235 if (e == 0) {
1236 // subnormal scale is determined by fraction
1237 // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1238 e = (2l - (1l << (es - 1ull))) - 1;
1239 if constexpr (nbits > 2 + es) {
1240 for (size_t i = nbits - 2ull - es; i > 0; --i) {
1241 if (test(i)) break;
1242 --e;
1243 }
1244 }
1245 }
1246 else {
1247 e -= EXP_BIAS;
1248 }
1249 }
1250 else {
1251 blockbinary<es, bt> ebits;
1252 exponent(ebits);
1253 if (ebits.iszero()) {
1254 // subnormal scale is determined by fraction
1255 // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1256 e = (2l - (1l << (es - 1ull))) - 1;
1257 if constexpr (nbits > 2 + es) {
1258 for (size_t i = nbits - 2ull - es; i > 0; --i) {
1259 if (test(i)) break;
1260 --e;
1261 }
1262 }
1263 }
1264 else {
1265 e = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1266 }
1267 }
1268 return e;
1269 }
1270 // tests
isneg() const1271 inline constexpr bool isneg() const noexcept { return sign(); }
ispos() const1272 inline constexpr bool ispos() const noexcept { return !sign(); }
iszeroencoding() const1273 inline constexpr bool iszeroencoding() const noexcept {
1274 if constexpr (0 == nrBlocks) {
1275 return true;
1276 }
1277 else if constexpr (1 == nrBlocks) {
1278 return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1279 }
1280 else if constexpr (2 == nrBlocks) {
1281 return (_block[0] == 0) && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1282 }
1283 else if constexpr (3 == nrBlocks) {
1284 return (_block[0] == 0) && _block[1] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1285 }
1286 else if constexpr (4 == nrBlocks) {
1287 return (_block[0] == 0) && _block[1] == 0 && _block[2] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1288 }
1289 else {
1290 for (size_t i = 0; i < nrBlocks - 1; ++i) if (_block[i] != 0) return false;
1291 return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1292 }
1293 }
iszero() const1294 inline constexpr bool iszero() const noexcept {
1295 if constexpr (hasSubnormals) {
1296 return iszeroencoding();
1297 }
1298 else { // all subnormals round to 0
1299 blockbinary<es, bt> ebits;
1300 exponent(ebits);
1301 if (ebits.iszero()) return true; else return false;
1302 }
1303 }
isone() const1304 inline constexpr bool isone() const noexcept {
1305 // unbiased exponent = scale = 0, fraction = 0
1306 int s = scale();
1307 if (s == 0) {
1308 blockbinary<fbits, bt> f;
1309 fraction(f);
1310 return f.iszero();
1311 }
1312 return false;
1313 }
1314 /// <summary>
1315 /// check if value is infinite, -inf, or +inf.
1316 /// +inf = 0-1111-11111-0: sign = 0, uncertainty = 0, es/fraction bits = 1
1317 /// -inf = 1-1111-11111-0: sign = 1, uncertainty = 0, es/fraction bits = 1
1318 /// </summary>
1319 /// <param name="InfType">default is 0, both types, -1 checks for -inf, 1 checks for +inf</param>
1320 /// <returns>true if +-inf, false otherwise</returns>
isinf(int InfType=INF_TYPE_EITHER) const1321 inline constexpr bool isinf(int InfType = INF_TYPE_EITHER) const noexcept {
1322 // the bit pattern encoding of Inf is independent of gradual overflow (supernormal) configuration
1323 bool isNegInf = false;
1324 bool isPosInf = false;
1325 if constexpr (0 == nrBlocks) {
1326 return false;
1327 }
1328 else if constexpr (1 == nrBlocks) {
1329 isNegInf = (_block[MSU] & MSU_MASK) == (MSU_MASK ^ LSB_BIT_MASK);
1330 isPosInf = (_block[MSU] & MSU_MASK) == ((MSU_MASK ^ SIGN_BIT_MASK) ^ LSB_BIT_MASK);
1331 }
1332 else if constexpr (2 == nrBlocks) {
1333 bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
1334 isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1335 isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1336 }
1337 else if constexpr (3 == nrBlocks) {
1338 bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK);
1339 isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1340 isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1341 }
1342 else if constexpr (4 == nrBlocks) {
1343 bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
1344 isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1345 isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1346 }
1347 else {
1348 bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
1349 for (size_t i = 1; i < nrBlocks - 1; ++i) {
1350 if (_block[i] != BLOCK_MASK) {
1351 isInf = false;
1352 break;
1353 }
1354 }
1355 isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1356 isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1357 }
1358
1359 return (InfType == INF_TYPE_EITHER ? (isNegInf || isPosInf) :
1360 (InfType == INF_TYPE_NEGATIVE ? isNegInf :
1361 (InfType == INF_TYPE_POSITIVE ? isPosInf : false)));
1362 }
1363 /// <summary>
1364 /// check if a value is a quiet or a signalling NaN
1365 /// quiet NaN = 0-1111-11111-1: sign = 0, uncertainty = 1, es/fraction bits = 1
1366 /// signalling NaN = 1-1111-11111-1: sign = 1, uncertainty = 1, es/fraction bits = 1
1367 /// </summary>
1368 /// <param name="NaNType">default is 0, both types, 1 checks for Signalling NaN, -1 checks for Quiet NaN</param>
1369 /// <returns>true if the right kind of NaN, false otherwise</returns>
isnanencoding(int NaNType=NAN_TYPE_EITHER) const1370 inline constexpr bool isnanencoding(int NaNType = NAN_TYPE_EITHER) const noexcept {
1371 // the bit encoding of NaN is independent of the gradual overflow configuration
1372 bool isNaN = true;
1373 bool isNegNaN = false;
1374 bool isPosNaN = false;
1375
1376 if constexpr (0 == nrBlocks) {
1377 return false;
1378 }
1379 else if constexpr (1 == nrBlocks) {
1380 }
1381 else if constexpr (2 == nrBlocks) {
1382 isNaN = (_block[0] == BLOCK_MASK);
1383 }
1384 else if constexpr (3 == nrBlocks) {
1385 isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK);
1386 }
1387 else if constexpr (4 == nrBlocks) {
1388 isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
1389 }
1390 else {
1391 for (size_t i = 0; i < nrBlocks - 1; ++i) {
1392 if (_block[i] != BLOCK_MASK) {
1393 isNaN = false;
1394 break;
1395 }
1396 }
1397 }
1398 isNegNaN = isNaN && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1399 isPosNaN = isNaN && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1400
1401 return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
1402 (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
1403 (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
1404 }
isnan(int NaNType=NAN_TYPE_EITHER) const1405 inline constexpr bool isnan(int NaNType = NAN_TYPE_EITHER) const noexcept {
1406 if constexpr (hasSupernormals) {
1407 return isnanencoding(NaNType);
1408 }
1409 else {
1410 bool isNaN = true;
1411 bool isNegNaN = false;
1412 bool isPosNaN = false;
1413 if (!issupernormal()) { isNaN = false; }
1414 isNegNaN = isNaN && sign();
1415 isPosNaN = isNaN && !sign();
1416 return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
1417 (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
1418 (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
1419 }
1420 }
1421 // isnormal returns true if exponent bits are not all zero or one, false otherwise
isnormal() const1422 inline constexpr bool isnormal() const noexcept {
1423 blockbinary<es, bt> e;
1424 exponent(e);
1425 // return !e.iszero() && !isinf() && !isnan(); // old definition that included the supernormals but excluded the extreme encodings
1426 return !e.iszero() && !e.isallones();
1427 }
1428 // isdenormal returns true if exponent bits are all zero, false otherwise
isdenormal() const1429 inline constexpr bool isdenormal() const noexcept {
1430 blockbinary<es, bt> e;
1431 exponent(e);
1432 return e.iszero();
1433 }
1434 // issupernormal returns true if exponent bits are all one, false otherwise
issupernormal() const1435 inline constexpr bool issupernormal() const noexcept {
1436 blockbinary<es, bt> e;
1437 exponent(e);
1438 return e.isallones();
1439 }
1440 template<typename NativeReal>
inrange(NativeReal v)1441 inline constexpr bool inrange(NativeReal v) {
1442 // the valid range for this cfloat includes the interval between
1443 // maxpos and the value that would round down to maxpos
1444 bool bIsInRange = true;
1445 if (v > 0) {
1446 cfloat c(SpecificValue::maxpos);
1447 cfloat<nbits + 1, es, BlockType, hasSubnormals, hasSupernormals, isSaturating> d;
1448 d = NativeReal(c);
1449 ++d;
1450 if (v >= NativeReal(d)) bIsInRange = false;
1451 }
1452 else {
1453 cfloat c(SpecificValue::maxneg);
1454 cfloat<nbits + 1, es, BlockType, hasSubnormals, hasSupernormals, isSaturating> d;
1455 d = NativeReal(c);
1456 --d;
1457 if (v <= NativeReal(d)) bIsInRange = false;
1458 }
1459
1460 return bIsInRange;
1461 }
test(size_t bitIndex) const1462 inline constexpr bool test(size_t bitIndex) const noexcept {
1463 return at(bitIndex);
1464 }
at(size_t bitIndex) const1465 inline constexpr bool at(size_t bitIndex) const noexcept {
1466 if (bitIndex < nbits) {
1467 bt word = _block[bitIndex / bitsInBlock];
1468 bt mask = bt(1ull << (bitIndex % bitsInBlock));
1469 return (word & mask);
1470 }
1471 return false;
1472 }
nibble(size_t n) const1473 inline constexpr uint8_t nibble(size_t n) const noexcept {
1474 if (n < (1 + ((nbits - 1) >> 2))) {
1475 bt word = _block[(n * 4) / bitsInBlock];
1476 int nibbleIndexInWord = int(n % (bitsInBlock >> 2ull));
1477 bt mask = bt(0xF << (nibbleIndexInWord * 4));
1478 bt nibblebits = bt(mask & word);
1479 return uint8_t(nibblebits >> (nibbleIndexInWord * 4));
1480 }
1481 return false;
1482 }
block(size_t b) const1483 inline constexpr bt block(size_t b) const noexcept {
1484 if (b < nrBlocks) {
1485 return _block[b];
1486 }
1487 return 0;
1488 }
1489
1490 // helper debug function
constexprClassParameters() const1491 void constexprClassParameters() const {
1492 std::cout << "-------------------------------------------------------------\n";
1493 std::cout << "type : " << typeid(*this).name() << '\n';
1494 std::cout << "nbits : " << nbits << '\n';
1495 std::cout << "es : " << es << std::endl;
1496 std::cout << "hasSubnormals : " << (hasSubnormals ? "true" : "false") << '\n';
1497 std::cout << "hasSupernormals : " << (hasSupernormals ? "true" : "false") << '\n';
1498 std::cout << "isSaturating : " << (isSaturating ? "true" : "false") << '\n';
1499 std::cout << "ALL_ONES : " << to_binary(ALL_ONES, 0, true) << '\n';
1500 std::cout << "BLOCK_MASK : " << to_binary(BLOCK_MASK, 0, true) << '\n';
1501 std::cout << "nrBlocks : " << nrBlocks << '\n';
1502 std::cout << "bits in MSU : " << bitsInMSU << '\n';
1503 std::cout << "MSU : " << MSU << '\n';
1504 std::cout << "MSU MASK : " << to_binary(MSU_MASK, 0, true) << '\n';
1505 std::cout << "SIGN_BIT_MASK : " << to_binary(SIGN_BIT_MASK, 0, true) << '\n';
1506 std::cout << "LSB_BIT_MASK : " << to_binary(LSB_BIT_MASK, 0, true) << '\n';
1507 std::cout << "MSU CAPTURES_EXP : " << (MSU_CAPTURES_EXP ? "yes\n" : "no\n");
1508 std::cout << "EXP_SHIFT : " << EXP_SHIFT << '\n';
1509 std::cout << "MSU EXP MASK : " << to_binary(MSU_EXP_MASK, 0, true) << '\n';
1510 std::cout << "ALL_ONE_MASK_ES : " << to_binary(ALL_ONES_ES) << '\n';
1511 std::cout << "EXP_BIAS : " << EXP_BIAS << '\n';
1512 std::cout << "MAX_EXP : " << MAX_EXP << '\n';
1513 std::cout << "MIN_EXP_NORMAL : " << MIN_EXP_NORMAL << '\n';
1514 std::cout << "MIN_EXP_SUBNORMAL : " << MIN_EXP_SUBNORMAL << '\n';
1515 std::cout << "fraction Blocks : " << fBlocks << '\n';
1516 std::cout << "bits in FSU : " << bitsInFSU << '\n';
1517 std::cout << "FSU : " << FSU << '\n';
1518 std::cout << "FSU MASK : " << to_binary(FSU_MASK, 0, true) << '\n';
1519 std::cout << "topfbits : " << topfbits << '\n';
1520 std::cout << "ALL_ONE_MASK_FR : " << to_binary(ALL_ONES_FR) << '\n';
1521 }
1522 // extract the sign field from the encoding
sign(bool & s) const1523 inline constexpr void sign(bool& s) const {
1524 s = sign();
1525 }
1526 // extract the exponent field from the encoding
exponent(blockbinary<es,bt> & e) const1527 inline constexpr void exponent(blockbinary<es, bt>& e) const {
1528 e.clear();
1529 if constexpr (0 == nrBlocks) return;
1530 else if constexpr (1 == nrBlocks) {
1531 bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
1532 e.setbits(uint64_t(ebits >> EXP_SHIFT));
1533 }
1534 else if constexpr (nrBlocks > 1) {
1535 if (MSU_CAPTURES_EXP) {
1536 bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
1537 e.setbits(uint64_t(ebits >> ((nbits - 1ull - es) % bitsInBlock)));
1538 }
1539 else {
1540 for (size_t i = 0; i < es; ++i) { e.setbit(i, at(nbits - 1ull - es + i)); }
1541 }
1542 }
1543 }
1544 // extract the fraction field from the encoding
fraction(blockbinary<fbits,bt> & f) const1545 inline constexpr void fraction(blockbinary<fbits, bt>& f) const {
1546 f.clear();
1547 if constexpr (0 == nrBlocks) return;
1548 else if constexpr (1 == nrBlocks) {
1549 bt fraction = bt(_block[MSU] & ~MSU_EXP_MASK);
1550 f.setbits(fraction);
1551 }
1552 else if constexpr (nrBlocks > 1) {
1553 for (size_t i = 0; i < fbits; ++i) { f.setbit(i, at(i)); } // TODO: TEST!
1554 }
1555 }
fraction_ull() const1556 inline constexpr uint64_t fraction_ull() const {
1557 uint64_t raw{ 0 };
1558 if constexpr (nbits - es - 1ull < 65ull) { // no-op if precondition doesn't hold
1559 if constexpr (1 == nrBlocks) {
1560 uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1561 raw = fbitMask & uint64_t(_block[0]);
1562 }
1563 else if constexpr (2 == nrBlocks) {
1564 uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1565 raw = fbitMask & ((uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1566 }
1567 else if constexpr (3 == nrBlocks) {
1568 uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1569 raw = fbitMask & ((uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1570 }
1571 else if constexpr (4 == nrBlocks) {
1572 uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1573 raw = fbitMask & ((uint64_t(_block[3]) << (3 * bitsInBlock)) | (uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1574 }
1575 else {
1576 uint64_t mask{ 1 };
1577 for (size_t i = 0; i < fbits; ++i) {
1578 if (test(i)) {
1579 raw |= mask;
1580 }
1581 mask <<= 1;
1582 }
1583 }
1584 }
1585 return raw;
1586 }
1587 // construct the significant from the encoding, returns normalization offset
significant(blockbinary<fhbits,bt> & s,bool isNormal=true) const1588 inline constexpr size_t significant(blockbinary<fhbits, bt>& s, bool isNormal = true) const {
1589 size_t shift = 0;
1590 if (iszero()) return 0;
1591 if constexpr (0 == nrBlocks) return 0;
1592 else if constexpr (1 == nrBlocks) {
1593 bt significant = bt(_block[MSU] & ~MSU_EXP_MASK & ~SIGN_BIT_MASK);
1594 if (isNormal) {
1595 significant |= (bt(0x1ul) << fbits);
1596 }
1597 else {
1598 size_t msb = findMostSignificantBit(significant);
1599 // std::cout << "msb : " << msb << " : fhbits : " << fhbits << " : " << to_binary(significant, true) << std::endl;
1600 shift = fhbits - msb;
1601 significant <<= shift;
1602 }
1603 s.setbits(significant);
1604 }
1605 else if constexpr (nrBlocks > 1) {
1606 s.clear();
1607 // TODO: design and implement a block-oriented algorithm, this sequential algorithm is super slow
1608 if (isNormal) {
1609 s.setbit(fbits);
1610 for (size_t i = 0; i < fbits; ++i) { s.setbit(i, at(i)); }
1611 }
1612 else {
1613 // Find the MSB of the subnormal:
1614 size_t msb = 0;
1615 for (size_t i = 0; i < fbits; ++i) { // msb protected from not being assigned through iszero test at prelude of function
1616 msb = fbits - 1ull - i;
1617 if (test(msb)) break;
1618 }
1619 // m-----lsb
1620 // h00001010101
1621 // 101010100000
1622 for (size_t i = 0; i <= msb; ++i) {
1623 s.setbit(fbits - msb + i, at(i));
1624 }
1625 shift = fhbits - msb;
1626 }
1627 }
1628 return shift;
1629 }
1630 // get the raw bits from the encoding
getbits(blockbinary<nbits,bt> & b) const1631 inline constexpr void getbits(blockbinary<nbits, bt>& b) const {
1632 b.clear();
1633 for (size_t i = 0; i < nbits; ++i) { b.setbit(i, at(i)); }
1634 }
1635 // casts to native types
to_long() const1636 long to_long() const { return long(to_native<double>()); }
to_long_long() const1637 long long to_long_long() const { return (long long)(to_native<double>()); }
1638 // transform an cfloat to a native C++ floating-point. We are using the native
1639 // precision to compute, which means that all sub-values need to be representable
1640 // by the native precision.
1641 // A more accurate approximation would require an adaptive precision algorithm
1642 // with a final rounding step.
1643 template<typename TargetFloat>
to_native() const1644 TargetFloat to_native() const {
1645 TargetFloat v{ 0.0 };
1646 if (iszero()) {
1647 // the optimizer might destroy the sign
1648 return sign() ? -TargetFloat(0) : TargetFloat(0);
1649 }
1650 else if (isnan()) {
1651 v = sign() ? std::numeric_limits<TargetFloat>::signaling_NaN() : std::numeric_limits<TargetFloat>::quiet_NaN();
1652 }
1653 else if (isinf()) {
1654 v = sign() ? -std::numeric_limits<TargetFloat>::infinity() : std::numeric_limits<TargetFloat>::infinity();
1655 }
1656 else { // TODO: this approach has catastrophic cancellation when nbits is large and native target float is too small
1657 TargetFloat f{ 0 };
1658 TargetFloat fbit{ 0.5 };
1659 for (int i = static_cast<int>(nbits - 2ull - es); i >= 0; --i) {
1660 f += at(static_cast<size_t>(i)) ? fbit : TargetFloat(0);
1661 fbit *= TargetFloat(0.5);
1662 }
1663 blockbinary<es, bt> ebits;
1664 exponent(ebits);
1665 if constexpr (hasSubnormals) {
1666 if (ebits.iszero()) {
1667 // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1668 TargetFloat exponentiation = TargetFloat(subnormal_exponent[es]); // precomputed values for 2^(2-2^(es-1))
1669 v = exponentiation * f; // f is already f/2^fbits
1670 return sign() ? -v : v;
1671 }
1672 }
1673 else {
1674 if (ebits.iszero()) { // underflow to 0
1675 // compiler fast float optimization might destroy the sign
1676 return sign() ? -TargetFloat(0) : TargetFloat(0);
1677 }
1678 }
1679 if constexpr (hasSupernormals) {
1680 // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1681 int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1682 if (-64 < exponent && exponent < 64) {
1683 TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
1684 v = exponentiation * (TargetFloat(1.0) + f);
1685 }
1686 else {
1687 double exponentiation = ipow(exponent);
1688 v = TargetFloat(exponentiation * (1.0 + f));
1689 }
1690 }
1691 else {
1692 if (ebits.isallones()) {
1693 // supernormals are mapped to quiet NaNs
1694 v = std::numeric_limits<TargetFloat>::quiet_NaN();
1695 return v;
1696 }
1697 else {
1698 // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1699 int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1700 if (-64 < exponent && exponent < 64) {
1701 TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
1702 v = exponentiation * (TargetFloat(1.0) + f);
1703 }
1704 else {
1705 double exponentiation = ipow(exponent);
1706 v = TargetFloat(exponentiation * (1.0 + f));
1707 }
1708 }
1709 }
1710 v = sign() ? -v : v;
1711 }
1712 return v;
1713 }
1714
1715 // make conversions to native types explicit
operator int() const1716 explicit operator int() const { return to_long_long(); }
operator long long() const1717 explicit operator long long() const { return to_long_long(); }
operator double() const1718 explicit operator double() const { return to_native<double>(); }
operator float() const1719 explicit operator float() const { return to_native<float>(); }
1720
1721 // convert a cfloat to a blocktriple with the fraction format 1.ffff
1722 // we are using the same block type so that we can use block copies to move bits around.
1723 // Since we tend to have at least two exponent bits, this will lead to
1724 // most cfloat<->blocktriple cases being efficient as the block types are aligned.
1725 // The relationship between the source cfloat and target blocktriple is not
1726 // arbitrary, enforce it by setting the blocktriple fbits to the cfloat's (nbits - es - 1)
normalize(blocktriple<fbits,BlockTripleOperator::REPRESENTATION,bt> & tgt) const1727 constexpr void normalize(blocktriple<fbits, BlockTripleOperator::REPRESENTATION, bt>& tgt) const {
1728 // test special cases
1729 if (isnan()) {
1730 tgt.setnan();
1731 }
1732 else if (isinf()) {
1733 tgt.setinf();
1734 }
1735 else if (iszero()) {
1736 tgt.setzero();
1737 }
1738 else {
1739 tgt.setnormal(); // a blocktriple is always normalized
1740 int scale = this->scale();
1741 tgt.setsign(sign());
1742 tgt.setscale(scale);
1743 // set significant
1744 // we are going to unify to the format 01.ffffeeee
1745 // where 'f' is a fraction bit, and 'e' is an extension bit
1746 // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
1747 if (isnormal()) {
1748 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1749 uint64_t raw = fraction_ull();
1750 raw |= (1ull << fbits);
1751 tgt.setbits(raw);
1752 }
1753 else {
1754 // brute force copy of blocks
1755 if constexpr (1 == fBlocks) {
1756 tgt.setblock(0, _block[0] & FSU_MASK);
1757 }
1758 else if constexpr (2 == fBlocks) {
1759 tgt.setblock(0, _block[0]);
1760 tgt.setblock(1, _block[1] & FSU_MASK);
1761 }
1762 else if constexpr (3 == fBlocks) {
1763 tgt.setblock(0, _block[0]);
1764 tgt.setblock(1, _block[1]);
1765 tgt.setblock(2, _block[2] & FSU_MASK);
1766 }
1767 else if constexpr (4 == fBlocks) {
1768 tgt.setblock(0, _block[0]);
1769 tgt.setblock(1, _block[1]);
1770 tgt.setblock(2, _block[2]);
1771 tgt.setblock(3, _block[3] & FSU_MASK);
1772 }
1773 else {
1774 for (size_t i = 0; i < FSU; ++i) {
1775 tgt.setblock(i, _block[i]);
1776 }
1777 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1778 }
1779 }
1780 }
1781 else { // it is a subnormal encoding in this target cfloat
1782 if constexpr (fbits < 64) {
1783 uint64_t raw = fraction_ull();
1784 int shift = MIN_EXP_NORMAL - scale;
1785 raw <<= shift;
1786 raw |= (1ull << fbits);
1787 tgt.setbits(raw);
1788 }
1789 else {
1790 // brute force copy of blocks
1791 if constexpr (1 == fBlocks) {
1792 tgt.setblock(0, _block[0] & FSU_MASK);
1793 }
1794 else if constexpr (2 == fBlocks) {
1795 tgt.setblock(0, _block[0]);
1796 tgt.setblock(1, _block[1] & FSU_MASK);
1797 }
1798 else if constexpr (3 == fBlocks) {
1799 tgt.setblock(0, _block[0]);
1800 tgt.setblock(1, _block[1]);
1801 tgt.setblock(2, _block[2] & FSU_MASK);
1802 }
1803 else if constexpr (4 == fBlocks) {
1804 tgt.setblock(0, _block[0]);
1805 tgt.setblock(1, _block[1]);
1806 tgt.setblock(2, _block[2]);
1807 tgt.setblock(3, _block[3] & FSU_MASK);
1808 }
1809 else {
1810 for (size_t i = 0; i < FSU; ++i) {
1811 tgt.setblock(i, _block[i]);
1812 }
1813 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1814 }
1815 }
1816 }
1817 }
1818 }
1819
1820 // normalize a cfloat to a blocktriple used in add/sub, which has the form 00h.fffff
1821 // that is 3 + fbits, the 3 extra bits are required to be able to use 2's complement
1822 // and capture the largest value of an addition/subtraction.
1823 // TODO: currently abits = 2*fhbits as the worst case input argument size to
1824 // capture the smallest normal value in aligned form. There is a faster/smaller
1825 // implementation where the input is constrainted to just the round, guard, and sticky bits.
normalizeAddition(blocktriple<fbits,BlockTripleOperator::ADD,bt> & tgt) const1826 constexpr void normalizeAddition(blocktriple<fbits, BlockTripleOperator::ADD, bt>& tgt) const {
1827 // test special cases
1828 if (isnan()) {
1829 tgt.setnan();
1830 }
1831 else if (isinf()) {
1832 tgt.setinf();
1833 }
1834 else if (iszero()) {
1835 tgt.setzero();
1836 }
1837 else {
1838 tgt.setnormal(); // a blocktriple is always normalized
1839 int scale = this->scale();
1840 tgt.setsign(sign());
1841 tgt.setscale(scale);
1842 // set significant
1843 // we are going to unify to the format 001.ffffeeee
1844 // where 'f' is a fraction bit, and 'e' is an extension bit
1845 // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
1846 if (isnormal()) {
1847 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1848 uint64_t raw = fraction_ull();
1849 raw |= (1ull << fbits); // add the hidden bit
1850 tgt.setbits(raw);
1851 }
1852 else {
1853 // brute force copy of blocks
1854 if constexpr (1 == fBlocks) {
1855 tgt.setblock(0, _block[0] & FSU_MASK);
1856 }
1857 else if constexpr (2 == fBlocks) {
1858 tgt.setblock(0, _block[0]);
1859 tgt.setblock(1, _block[1] & FSU_MASK);
1860 }
1861 else if constexpr (3 == fBlocks) {
1862 tgt.setblock(0, _block[0]);
1863 tgt.setblock(1, _block[1]);
1864 tgt.setblock(2, _block[2] & FSU_MASK);
1865 }
1866 else if constexpr (4 == fBlocks) {
1867 tgt.setblock(0, _block[0]);
1868 tgt.setblock(1, _block[1]);
1869 tgt.setblock(2, _block[2]);
1870 tgt.setblock(3, _block[3] & FSU_MASK);
1871 }
1872 else if constexpr (5 == fBlocks) {
1873 tgt.setblock(0, _block[0]);
1874 tgt.setblock(1, _block[1]);
1875 tgt.setblock(2, _block[2]);
1876 tgt.setblock(3, _block[3]);
1877 tgt.setblock(4, _block[4] & FSU_MASK);
1878 }
1879 else if constexpr (6 == fBlocks) {
1880 tgt.setblock(0, _block[0]);
1881 tgt.setblock(1, _block[1]);
1882 tgt.setblock(2, _block[2]);
1883 tgt.setblock(3, _block[3]);
1884 tgt.setblock(4, _block[4]);
1885 tgt.setblock(5, _block[5] & FSU_MASK);
1886 }
1887 else if constexpr (7 == fBlocks) {
1888 tgt.setblock(0, _block[0]);
1889 tgt.setblock(1, _block[1]);
1890 tgt.setblock(2, _block[2]);
1891 tgt.setblock(3, _block[3]);
1892 tgt.setblock(4, _block[4]);
1893 tgt.setblock(5, _block[5]);
1894 tgt.setblock(6, _block[6] & FSU_MASK);
1895 }
1896 else if constexpr (8 == fBlocks) {
1897 tgt.setblock(0, _block[0]);
1898 tgt.setblock(1, _block[1]);
1899 tgt.setblock(2, _block[2]);
1900 tgt.setblock(3, _block[3]);
1901 tgt.setblock(4, _block[4]);
1902 tgt.setblock(5, _block[5]);
1903 tgt.setblock(6, _block[6]);
1904 tgt.setblock(7, _block[7] & FSU_MASK);
1905 }
1906 else {
1907 for (size_t i = 0; i < FSU; ++i) {
1908 tgt.setblock(i, _block[i]);
1909 }
1910 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1911 }
1912 }
1913 }
1914 else {
1915 if (isdenormal()) { // it is a subnormal encoding in this target cfloat
1916 if constexpr (hasSubnormals) {
1917 if constexpr (fbits < 64) {
1918 uint64_t raw = fraction_ull();
1919 int shift = MIN_EXP_NORMAL - scale;
1920 raw <<= shift; // shift but do NOT add a hidden bit as the MSB of the subnormal is shifted in the hidden bit position
1921 tgt.setbits(raw);
1922 }
1923 else {
1924 // brute force copy of blocks
1925 if constexpr (1 == fBlocks) {
1926 tgt.setblock(0, _block[0] & FSU_MASK);
1927 }
1928 else if constexpr (2 == fBlocks) {
1929 tgt.setblock(0, _block[0]);
1930 tgt.setblock(1, _block[1] & FSU_MASK);
1931 }
1932 else if constexpr (3 == fBlocks) {
1933 tgt.setblock(0, _block[0]);
1934 tgt.setblock(1, _block[1]);
1935 tgt.setblock(2, _block[2] & FSU_MASK);
1936 }
1937 else if constexpr (4 == fBlocks) {
1938 tgt.setblock(0, _block[0]);
1939 tgt.setblock(1, _block[1]);
1940 tgt.setblock(2, _block[2]);
1941 tgt.setblock(3, _block[3] & FSU_MASK);
1942 }
1943 else if constexpr (5 == fBlocks) {
1944 tgt.setblock(0, _block[0]);
1945 tgt.setblock(1, _block[1]);
1946 tgt.setblock(2, _block[2]);
1947 tgt.setblock(3, _block[3]);
1948 tgt.setblock(4, _block[4] & FSU_MASK);
1949 }
1950 else if constexpr (6 == fBlocks) {
1951 tgt.setblock(0, _block[0]);
1952 tgt.setblock(1, _block[1]);
1953 tgt.setblock(2, _block[2]);
1954 tgt.setblock(3, _block[3]);
1955 tgt.setblock(4, _block[4]);
1956 tgt.setblock(5, _block[5] & FSU_MASK);
1957 }
1958 else if constexpr (7 == fBlocks) {
1959 tgt.setblock(0, _block[0]);
1960 tgt.setblock(1, _block[1]);
1961 tgt.setblock(2, _block[2]);
1962 tgt.setblock(3, _block[3]);
1963 tgt.setblock(4, _block[4]);
1964 tgt.setblock(5, _block[5]);
1965 tgt.setblock(6, _block[6] & FSU_MASK);
1966 }
1967 else if constexpr (8 == fBlocks) {
1968 tgt.setblock(0, _block[0]);
1969 tgt.setblock(1, _block[1]);
1970 tgt.setblock(2, _block[2]);
1971 tgt.setblock(3, _block[3]);
1972 tgt.setblock(4, _block[4]);
1973 tgt.setblock(5, _block[5]);
1974 tgt.setblock(6, _block[6]);
1975 tgt.setblock(7, _block[7] & FSU_MASK);
1976 }
1977 else {
1978 for (size_t i = 0; i < FSU; ++i) {
1979 tgt.setblock(i, _block[i]);
1980 }
1981 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1982 }
1983 }
1984 }
1985 else { // this cfloat has no subnormals
1986 tgt.setzero(tgt.sign()); // preserve the sign
1987 }
1988 }
1989 else {
1990 // by design, a cfloat is either normal, subnormal, or supernormal, so this else clause is by deduction covering a supernormal
1991 // if (issupernormal()) { // it is a supernormal encoding
1992 if constexpr (hasSupernormals) {
1993 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1994 uint64_t raw = fraction_ull();
1995 raw |= (1ull << fbits); // add the hidden bit
1996 tgt.setbits(raw);
1997 }
1998 else {
1999 // brute force copy of blocks
2000 if constexpr (1 == fBlocks) {
2001 tgt.setblock(0, _block[0] & FSU_MASK);
2002 }
2003 else if constexpr (2 == fBlocks) {
2004 tgt.setblock(0, _block[0]);
2005 tgt.setblock(1, _block[1] & FSU_MASK);
2006 }
2007 else if constexpr (3 == fBlocks) {
2008 tgt.setblock(0, _block[0]);
2009 tgt.setblock(1, _block[1]);
2010 tgt.setblock(2, _block[2] & FSU_MASK);
2011 }
2012 else if constexpr (4 == fBlocks) {
2013 tgt.setblock(0, _block[0]);
2014 tgt.setblock(1, _block[1]);
2015 tgt.setblock(2, _block[2]);
2016 tgt.setblock(3, _block[3] & FSU_MASK);
2017 }
2018 else if constexpr (5 == fBlocks) {
2019 tgt.setblock(0, _block[0]);
2020 tgt.setblock(1, _block[1]);
2021 tgt.setblock(2, _block[2]);
2022 tgt.setblock(3, _block[3]);
2023 tgt.setblock(4, _block[4] & FSU_MASK);
2024 }
2025 else if constexpr (6 == fBlocks) {
2026 tgt.setblock(0, _block[0]);
2027 tgt.setblock(1, _block[1]);
2028 tgt.setblock(2, _block[2]);
2029 tgt.setblock(3, _block[3]);
2030 tgt.setblock(4, _block[4]);
2031 tgt.setblock(5, _block[5] & FSU_MASK);
2032 }
2033 else if constexpr (7 == fBlocks) {
2034 tgt.setblock(0, _block[0]);
2035 tgt.setblock(1, _block[1]);
2036 tgt.setblock(2, _block[2]);
2037 tgt.setblock(3, _block[3]);
2038 tgt.setblock(4, _block[4]);
2039 tgt.setblock(5, _block[5]);
2040 tgt.setblock(6, _block[6] & FSU_MASK);
2041 }
2042 else if constexpr (8 == fBlocks) {
2043 tgt.setblock(0, _block[0]);
2044 tgt.setblock(1, _block[1]);
2045 tgt.setblock(2, _block[2]);
2046 tgt.setblock(3, _block[3]);
2047 tgt.setblock(4, _block[4]);
2048 tgt.setblock(5, _block[5]);
2049 tgt.setblock(6, _block[6]);
2050 tgt.setblock(7, _block[7] & FSU_MASK);
2051 }
2052 else {
2053 for (size_t i = 0; i < FSU; ++i) {
2054 tgt.setblock(i, _block[i]);
2055 }
2056 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2057 }
2058 }
2059 }
2060 else { // this cfloat has no supernormals and thus this represents a nan, signalling or quiet determined by the sign
2061 tgt.setnan(tgt.sign());
2062 }
2063 // }
2064 }
2065 }
2066 }
2067 }
2068
2069 // Normalize a cfloat to a blocktriple used in mul, which has the form 0'00001.fffff
2070 // that is 2*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2071 // The result radix will go to 2*fbits after multiplication.
normalizeMultiplication(blocktriple<fbits,BlockTripleOperator::MUL,bt> & tgt) const2072 constexpr void normalizeMultiplication(blocktriple<fbits, BlockTripleOperator::MUL, bt>& tgt) const {
2073 // test special cases
2074 if (isnan()) {
2075 tgt.setnan();
2076 }
2077 else if (isinf()) {
2078 tgt.setinf();
2079 }
2080 else if (iszero()) {
2081 tgt.setzero();
2082 }
2083 else {
2084 tgt.setnormal(); // a blocktriple is always normalized
2085 int scale = this->scale();
2086 tgt.setsign(sign());
2087 tgt.setscale(scale);
2088 tgt.setradix(fbits);
2089 // set significant
2090 // we are going to unify to the format 01.ffffeeee
2091 // where 'f' is a fraction bit, and 'e' is an extension bit
2092 // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2093 if (isnormal()) {
2094 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2095 uint64_t raw = fraction_ull();
2096 raw |= (1ull << fbits);
2097 tgt.setbits(raw);
2098 }
2099 else {
2100 // brute force copy of blocks
2101 if constexpr (1 == fBlocks) {
2102 tgt.setblock(0, _block[0] & FSU_MASK);
2103 }
2104 else if constexpr (2 == fBlocks) {
2105 tgt.setblock(0, _block[0]);
2106 tgt.setblock(1, _block[1] & FSU_MASK);
2107 }
2108 else if constexpr (3 == fBlocks) {
2109 tgt.setblock(0, _block[0]);
2110 tgt.setblock(1, _block[1]);
2111 tgt.setblock(2, _block[2] & FSU_MASK);
2112 }
2113 else if constexpr (4 == fBlocks) {
2114 tgt.setblock(0, _block[0]);
2115 tgt.setblock(1, _block[1]);
2116 tgt.setblock(2, _block[2]);
2117 tgt.setblock(3, _block[3] & FSU_MASK);
2118 }
2119 else if constexpr (5 == fBlocks) {
2120 tgt.setblock(0, _block[0]);
2121 tgt.setblock(1, _block[1]);
2122 tgt.setblock(2, _block[2]);
2123 tgt.setblock(3, _block[3]);
2124 tgt.setblock(4, _block[4] & FSU_MASK);
2125 }
2126 else if constexpr (6 == fBlocks) {
2127 tgt.setblock(0, _block[0]);
2128 tgt.setblock(1, _block[1]);
2129 tgt.setblock(2, _block[2]);
2130 tgt.setblock(3, _block[3]);
2131 tgt.setblock(4, _block[4]);
2132 tgt.setblock(5, _block[5] & FSU_MASK);
2133 }
2134 else if constexpr (7 == fBlocks) {
2135 tgt.setblock(0, _block[0]);
2136 tgt.setblock(1, _block[1]);
2137 tgt.setblock(2, _block[2]);
2138 tgt.setblock(3, _block[3]);
2139 tgt.setblock(4, _block[4]);
2140 tgt.setblock(5, _block[5]);
2141 tgt.setblock(6, _block[6] & FSU_MASK);
2142 }
2143 else if constexpr (8 == fBlocks) {
2144 tgt.setblock(0, _block[0]);
2145 tgt.setblock(1, _block[1]);
2146 tgt.setblock(2, _block[2]);
2147 tgt.setblock(3, _block[3]);
2148 tgt.setblock(4, _block[4]);
2149 tgt.setblock(5, _block[5]);
2150 tgt.setblock(6, _block[6]);
2151 tgt.setblock(7, _block[7] & FSU_MASK);
2152 }
2153 else {
2154 for (size_t i = 0; i < FSU; ++i) {
2155 tgt.setblock(i, _block[i]);
2156 }
2157 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2158 }
2159 }
2160 }
2161 else {
2162 if (isdenormal()) { // it is a subnormal encoding in this target cfloat
2163 if constexpr (hasSubnormals) {
2164 if constexpr (fbits < 64) {
2165 uint64_t raw = fraction_ull();
2166 int shift = MIN_EXP_NORMAL - scale;
2167 raw <<= shift;
2168 raw |= (1ull << fbits);
2169 tgt.setbits(raw);
2170 }
2171 else {
2172 // brute force copy of blocks
2173 if constexpr (1 == fBlocks) {
2174 tgt.setblock(0, _block[0] & FSU_MASK);
2175 }
2176 else if constexpr (2 == fBlocks) {
2177 tgt.setblock(0, _block[0]);
2178 tgt.setblock(1, _block[1] & FSU_MASK);
2179 }
2180 else if constexpr (3 == fBlocks) {
2181 tgt.setblock(0, _block[0]);
2182 tgt.setblock(1, _block[1]);
2183 tgt.setblock(2, _block[2] & FSU_MASK);
2184 }
2185 else if constexpr (4 == fBlocks) {
2186 tgt.setblock(0, _block[0]);
2187 tgt.setblock(1, _block[1]);
2188 tgt.setblock(2, _block[2]);
2189 tgt.setblock(3, _block[3] & FSU_MASK);
2190 }
2191 else if constexpr (5 == fBlocks) {
2192 tgt.setblock(0, _block[0]);
2193 tgt.setblock(1, _block[1]);
2194 tgt.setblock(2, _block[2]);
2195 tgt.setblock(3, _block[3]);
2196 tgt.setblock(4, _block[4] & FSU_MASK);
2197 }
2198 else if constexpr (6 == fBlocks) {
2199 tgt.setblock(0, _block[0]);
2200 tgt.setblock(1, _block[1]);
2201 tgt.setblock(2, _block[2]);
2202 tgt.setblock(3, _block[3]);
2203 tgt.setblock(4, _block[4]);
2204 tgt.setblock(5, _block[5] & FSU_MASK);
2205 }
2206 else if constexpr (7 == fBlocks) {
2207 tgt.setblock(0, _block[0]);
2208 tgt.setblock(1, _block[1]);
2209 tgt.setblock(2, _block[2]);
2210 tgt.setblock(3, _block[3]);
2211 tgt.setblock(4, _block[4]);
2212 tgt.setblock(5, _block[5]);
2213 tgt.setblock(6, _block[6] & FSU_MASK);
2214 }
2215 else if constexpr (8 == fBlocks) {
2216 tgt.setblock(0, _block[0]);
2217 tgt.setblock(1, _block[1]);
2218 tgt.setblock(2, _block[2]);
2219 tgt.setblock(3, _block[3]);
2220 tgt.setblock(4, _block[4]);
2221 tgt.setblock(5, _block[5]);
2222 tgt.setblock(6, _block[6]);
2223 tgt.setblock(7, _block[7] & FSU_MASK);
2224 }
2225 else {
2226 for (size_t i = 0; i < FSU; ++i) {
2227 tgt.setblock(i, _block[i]);
2228 }
2229 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2230 }
2231 }
2232 }
2233 else { // this cfloat has no subnormals
2234 tgt.setzero(tgt.sign()); // preserve the sign
2235 }
2236 }
2237 else {
2238 // by design, a cfloat is either normal, subnormal, or supernormal, so this else clause is by deduction covering a supernormal
2239 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2240 uint64_t raw = fraction_ull();
2241 raw |= (1ull << fbits);
2242 tgt.setbits(raw);
2243 }
2244 else {
2245 // brute force copy of blocks
2246 if constexpr (1 == fBlocks) {
2247 tgt.setblock(0, _block[0] & FSU_MASK);
2248 }
2249 else if constexpr (2 == fBlocks) {
2250 tgt.setblock(0, _block[0]);
2251 tgt.setblock(1, _block[1] & FSU_MASK);
2252 }
2253 else if constexpr (3 == fBlocks) {
2254 tgt.setblock(0, _block[0]);
2255 tgt.setblock(1, _block[1]);
2256 tgt.setblock(2, _block[2] & FSU_MASK);
2257 }
2258 else if constexpr (4 == fBlocks) {
2259 tgt.setblock(0, _block[0]);
2260 tgt.setblock(1, _block[1]);
2261 tgt.setblock(2, _block[2]);
2262 tgt.setblock(3, _block[3] & FSU_MASK);
2263 }
2264 else if constexpr (5 == fBlocks) {
2265 tgt.setblock(0, _block[0]);
2266 tgt.setblock(1, _block[1]);
2267 tgt.setblock(2, _block[2]);
2268 tgt.setblock(3, _block[3]);
2269 tgt.setblock(4, _block[4] & FSU_MASK);
2270 }
2271 else if constexpr (6 == fBlocks) {
2272 tgt.setblock(0, _block[0]);
2273 tgt.setblock(1, _block[1]);
2274 tgt.setblock(2, _block[2]);
2275 tgt.setblock(3, _block[3]);
2276 tgt.setblock(4, _block[4]);
2277 tgt.setblock(5, _block[5] & FSU_MASK);
2278 }
2279 else if constexpr (7 == fBlocks) {
2280 tgt.setblock(0, _block[0]);
2281 tgt.setblock(1, _block[1]);
2282 tgt.setblock(2, _block[2]);
2283 tgt.setblock(3, _block[3]);
2284 tgt.setblock(4, _block[4]);
2285 tgt.setblock(5, _block[5]);
2286 tgt.setblock(6, _block[6] & FSU_MASK);
2287 }
2288 else if constexpr (8 == fBlocks) {
2289 tgt.setblock(0, _block[0]);
2290 tgt.setblock(1, _block[1]);
2291 tgt.setblock(2, _block[2]);
2292 tgt.setblock(3, _block[3]);
2293 tgt.setblock(4, _block[4]);
2294 tgt.setblock(5, _block[5]);
2295 tgt.setblock(6, _block[6]);
2296 tgt.setblock(7, _block[7] & FSU_MASK);
2297 }
2298 else {
2299 for (size_t i = 0; i < FSU; ++i) {
2300 tgt.setblock(i, _block[i]);
2301 }
2302 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2303 }
2304 }
2305
2306 }
2307 }
2308 }
2309 }
2310
2311 // normalize a cfloat to a blocktriple used in div, which has the form 0'00000'00001.fffff
2312 // that is 3*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2313 // the result radix will go to 2*fbits after multiplication.
2314 // TODO: needs implementation
normalizeDivision(blocktriple<fbits,BlockTripleOperator::DIV,bt> & tgt) const2315 constexpr void normalizeDivision(blocktriple<fbits, BlockTripleOperator::DIV, bt>& tgt) const {
2316 // test special cases
2317 if (isnan()) {
2318 tgt.setnan();
2319 }
2320 else if (isinf()) {
2321 tgt.setinf();
2322 }
2323 else if (iszero()) {
2324 tgt.setzero();
2325 }
2326 else {
2327 tgt.setnormal(); // a blocktriple is always normalized
2328 int scale = this->scale();
2329 tgt.setsign(sign());
2330 tgt.setscale(scale);
2331 // set significant
2332 // we are going to unify to the format 01.ffffeeee
2333 // where 'f' is a fraction bit, and 'e' is an extension bit
2334 // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2335 if (isnormal()) {
2336 if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2337 uint64_t raw = fraction_ull();
2338 raw |= (1ull << fbits);
2339 tgt.setbits(raw);
2340 }
2341 else {
2342 // brute force copy of blocks
2343 if constexpr (1 == fBlocks) {
2344 tgt.setblock(0, _block[0] & FSU_MASK);
2345 }
2346 else if constexpr (2 == fBlocks) {
2347 tgt.setblock(0, _block[0]);
2348 tgt.setblock(1, _block[1] & FSU_MASK);
2349 }
2350 else if constexpr (3 == fBlocks) {
2351 tgt.setblock(0, _block[0]);
2352 tgt.setblock(1, _block[1]);
2353 tgt.setblock(2, _block[2] & FSU_MASK);
2354 }
2355 else if constexpr (4 == fBlocks) {
2356 tgt.setblock(0, _block[0]);
2357 tgt.setblock(1, _block[1]);
2358 tgt.setblock(2, _block[2]);
2359 tgt.setblock(3, _block[3] & FSU_MASK);
2360 }
2361 else {
2362 for (size_t i = 0; i < FSU; ++i) {
2363 tgt.setblock(i, _block[i]);
2364 }
2365 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2366 }
2367 }
2368 }
2369 else { // it is a subnormal encoding in this target cfloat
2370 if constexpr (fbits < 64) {
2371 uint64_t raw = fraction_ull();
2372 int shift = MIN_EXP_NORMAL - scale;
2373 raw <<= shift;
2374 raw |= (1ull << fbits);
2375 tgt.setbits(raw);
2376 }
2377 else {
2378 // brute force copy of blocks
2379 if constexpr (1 == fBlocks) {
2380 tgt.setblock(0, _block[0] & FSU_MASK);
2381 }
2382 else if constexpr (2 == fBlocks) {
2383 tgt.setblock(0, _block[0]);
2384 tgt.setblock(1, _block[1] & FSU_MASK);
2385 }
2386 else if constexpr (3 == fBlocks) {
2387 tgt.setblock(0, _block[0]);
2388 tgt.setblock(1, _block[1]);
2389 tgt.setblock(2, _block[2] & FSU_MASK);
2390 }
2391 else if constexpr (4 == fBlocks) {
2392 tgt.setblock(0, _block[0]);
2393 tgt.setblock(1, _block[1]);
2394 tgt.setblock(2, _block[2]);
2395 tgt.setblock(3, _block[3] & FSU_MASK);
2396 }
2397 else {
2398 for (size_t i = 0; i < FSU; ++i) {
2399 tgt.setblock(i, _block[i]);
2400 }
2401 tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2402 }
2403 }
2404 }
2405 }
2406 }
2407
2408 protected:
2409 // HELPER methods
2410
2411 // convert an unsigned integer into a cfloat
2412 // TODO: this method does not protect against being called with a signed integer
2413 template<typename Ty>
convert_unsigned_integer(const Ty & rhs)2414 constexpr cfloat& convert_unsigned_integer(const Ty& rhs) noexcept {
2415 clear();
2416 if (0 == rhs) return *this;
2417 uint64_t raw = static_cast<uint64_t>(rhs);
2418 int exponent = int(findMostSignificantBit(raw)) - 1; // precondition that msb > 0 is satisfied by the zero test above
2419 constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
2420 uint32_t shift = sizeInBits - exponent - 1;
2421 raw <<= shift;
2422 raw = round<sizeInBits, uint64_t>(raw, exponent);
2423 return *this;
2424 }
2425 // convert a signed integer into a cfloat
2426 // TODO: this method does not protect against being called with a signed integer
2427 template<typename Ty>
convert_signed_integer(const Ty & rhs)2428 constexpr cfloat& convert_signed_integer(const Ty& rhs) noexcept {
2429 clear();
2430 if (0 == rhs) return *this;
2431 bool s = (rhs < 0);
2432 uint64_t raw = static_cast<uint64_t>(s ? -rhs : rhs);
2433 int exponent = int(findMostSignificantBit(raw)) - 1; // precondition that msb > 0 is satisfied by the zero test above
2434 constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
2435 uint32_t shift = sizeInBits - exponent - 1;
2436 raw <<= shift;
2437 raw = round<sizeInBits, uint64_t>(raw, exponent);
2438 #ifdef TODO
2439 // construct the target cfloat
2440 if constexpr (64 >= nbits - es - 1ull) {
2441 uint64_t bits = (s ? 1u : 0u);
2442 bits <<= es;
2443 bits |= exponent + EXP_BIAS;
2444 bits <<= nbits - 1ull - es;
2445 bits |= raw;
2446 bits &= 0xFFFF'FFFF;
2447 if constexpr (1 == nrBlocks) {
2448 _block[MSU] = bt(bits);
2449 }
2450 else {
2451 copyBits(bits);
2452 }
2453 }
2454 else {
2455 std::cerr << "TBD\n";
2456 }
2457 #endif
2458 return *this;
2459 }
2460
2461 public:
2462 template<typename Real>
convert_ieee754(Real rhs)2463 CONSTEXPRESSION cfloat& convert_ieee754(Real rhs) noexcept {
2464 // when we are a perfect match to single precision IEEE-754
2465 // take the bits verbatim as a cfloat is a superset of IEEE-754 in all configurations
2466 if constexpr (nbits == 32 && es == 8) {
2467 bool s{ false };
2468 uint64_t rawExponent{ 0 };
2469 uint64_t rawFraction{ 0 };
2470 // use native conversion
2471 extractFields(float(rhs), s, rawExponent, rawFraction);
2472 uint64_t raw{ s ? 1ull : 0ull };
2473 raw <<= 31;
2474 raw |= (rawExponent << fbits);
2475 raw |= rawFraction;
2476 setbits(raw);
2477 return *this;
2478 }
2479 // when we are a perfect match to double precision IEEE-754
2480 else if constexpr (nbits == 64 && es == 11) {
2481 bool s{ false };
2482 uint64_t rawExponent{ 0 };
2483 uint64_t rawFraction{ 0 };
2484 // use native conversion
2485 extractFields(double(rhs), s, rawExponent, rawFraction);
2486 uint64_t raw{ s ? 1ull : 0ull };
2487 raw <<= 63;
2488 raw |= (rawExponent << fbits);
2489 raw |= rawFraction;
2490 setbits(raw);
2491 return *this;
2492 }
2493 else {
2494 clear();
2495 // extract raw IEEE-754 bits
2496 bool s{ false };
2497 uint64_t rawExponent{ 0 };
2498 uint64_t rawFraction{ 0 };
2499 extractFields(rhs, s, rawExponent, rawFraction);
2500
2501 // special case handling
2502 if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf
2503 if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
2504 rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
2505 // 1.11111111.00000000.......00000001 signalling nan
2506 // 0.11111111.00000000000000000000001 signalling nan
2507 // MSVC
2508 // 1.11111111.10000000.......00000001 signalling nan
2509 // 0.11111111.10000000.......00000001 signalling nan
2510 setnan(NAN_TYPE_SIGNALLING);
2511 return *this;
2512 }
2513 if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
2514 // 1.11111111.10000000.......00000000 quiet nan
2515 // 0.11111111.10000000.......00000000 quiet nan
2516 setnan(NAN_TYPE_QUIET);
2517 return *this;
2518 }
2519 if (rawFraction == 0ull) {
2520 // 1.11111111.0000000.......000000000 -inf
2521 // 0.11111111.0000000.......000000000 +inf
2522 setinf(s);
2523 return *this;
2524 }
2525 }
2526 if (rhs == 0.0) { // IEEE rule: this is valid for + and - 0.0
2527 setbit(nbits - 1ull, s);
2528 return *this;
2529 }
2530 // normal number consists of fbits fraction bits and one hidden bit
2531 // subnormal number has no hidden bit
2532 int exponent = static_cast<int>(rawExponent) - ieee754_parameter<Real>::bias; // unbias the exponent
2533
2534 // check special case of
2535 // 1- saturating to maxpos/maxneg, or
2536 // 2- projecting to +-inf
2537 // if the value is out of range.
2538 //
2539 // One problem here is that at the rounding cusps of maxpos <-> inf <-> nan
2540 // you need to go through the rounding logic to know which encoding you end up
2541 // with.
2542 // For each specific cfloat configuration, you can work out these rounding cusps
2543 // but they need to go through the value transformation to map them back to native
2544 // IEEE-754. That is a complex computation to do as a static constexpr as you need
2545 // to construct the value, then evaluate it, and store it.
2546 //
2547 // The algorithm used here is to check for the obvious out of range values by
2548 // comparing their scale to the max scale this cfloat encoding can represent.
2549 // For the rounding cusps, we go through the rounding logic, and then clean up
2550 // after rounding using the observation that no conversion from a value can ever
2551 // yield the NaN encoding.
2552 //
2553 // The rounding logic will correctly sort between maxpos and inf, and we clean
2554 // up any NaN encodings by projecting back to the configuration's saturation rule.
2555 //
2556 // We could improve on this by creating the database of rounding cusps and
2557 // referencing them with a direct value comparison with the input. That would be
2558 // the most performant implementation.
2559 if (exponent > MAX_EXP) {
2560 if constexpr (isSaturating) {
2561 if (s) this->maxneg(); else this->maxpos(); // saturate to maxpos or maxneg
2562 }
2563 else {
2564 setinf(s);
2565 }
2566 return *this;
2567 }
2568 if constexpr (hasSubnormals) {
2569 if (exponent < MIN_EXP_SUBNORMAL - 1) {
2570 // map to +-0 any values that have a scale less than (MIN_EXP_SUBMORNAL - 1)
2571 this->setbit(nbits - 1, s);
2572 return *this;
2573 }
2574 }
2575 else {
2576 if (exponent < MIN_EXP_NORMAL - 1) {
2577 // map to +-0 any values that have a scale less than (MIN_EXP_MORNAL - 1)
2578 this->setbit(nbits - 1, s);
2579 return *this;
2580 }
2581 }
2582
2583 /////////////////
2584 /// end of special case processing, move on to value sampling and rounding
2585
2586 #if TRACE_CONVERSION
2587 std::cout << '\n';
2588 std::cout << "value : " << rhs << '\n';
2589 std::cout << "segments : " << to_binary(rhs) << '\n';
2590 std::cout << "sign bit : " << (s ? '1' : '0') << '\n';
2591 std::cout << "exponent bits : " << to_binary(rawExponent, ieee754_parameter<Real>::ebits, true) << '\n';
2592 std::cout << "fraction bits : " << to_binary(rawFraction, ieee754_parameter<Real>::fbits, true) << std::endl;
2593 std::cout << "exponent value : " << exponent << '\n';
2594 #endif
2595
2596 // do the following scenarios have different rounding bits?
2597 // input is normal, cfloat is normal <-- rounding can happen with native ieee-754 bits
2598 // input is normal, cfloat is subnormal
2599 // input is subnormal, cfloat is normal
2600 // input is subnormal, cfloat is subnormal
2601
2602 // The first condition is the relationship between the number
2603 // of fraction bits from the source and the number of fraction bits
2604 // in the target cfloat: these are constexpressions and guard the shifts
2605 // input fbits >= cfloat fbits <-- need to round
2606 // input fbits < cfloat fbits <-- no need to round
2607
2608 if constexpr (fbits < ieee754_parameter<Real>::fbits) {
2609 // this is the common case for cfloats that are smaller in size compared to single and double precision IEEE-754
2610 constexpr int rightShift = ieee754_parameter<Real>::fbits - fbits; // this is the bit shift to get the MSB of the src to the MSB of the tgt
2611 uint32_t biasedExponent{ 0 };
2612 int adjustment{ 0 }; // right shift adjustment for subnormal representation
2613 uint64_t mask;
2614 if (rawExponent != 0) {
2615 // the source real is a normal number,
2616 if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2617 // the value is a subnormal number in this representation: biasedExponent = 0
2618 // add the hidden bit to the fraction bits so the denormalization has the correct MSB
2619 rawFraction |= ieee754_parameter<Real>::hmask;
2620
2621 // fraction processing: we have 1 hidden + 23 explicit fraction bits
2622 // f = 1.ffff 2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2623 // -exponent because we are right shifting and exponent in this range is negative
2624 adjustment = -(exponent + subnormal_reciprocal_shift[es]);
2625 // this is the right shift adjustment required for subnormal representation due
2626 // to the scale of the input number, i.e. the exponent of 2^-adjustment
2627 }
2628 else {
2629 // the value is a normal number in this representation: common case
2630 biasedExponent = static_cast<uint32_t>(exponent + EXP_BIAS); // project the exponent into the target
2631 // fraction processing
2632 // float structure is: seee'eeee'efff'ffff'ffff'ffff'ffff'ffff, s = sign, e - exponent bit, f = fraction bit
2633 // target structure is for example cfloat<8,2>: seef'ffff
2634 // since both are normals, we can shift the incoming fraction to the target structure bits, and round
2635 // MSB of source = 23 - 1, MSB of target = fbits - 1: shift = MSB of src - MSB of tgt => 23 - fbits
2636 adjustment = 0;
2637 }
2638 if constexpr (rightShift > 0) {
2639 // if true we need to round
2640 // round-to-even logic
2641 // ... lsb | guard round sticky round
2642 // x 0 x x down
2643 // 0 1 0 0 down round to even
2644 // 1 1 0 0 up round to even
2645 // x 1 0 1 up
2646 // x 1 1 0 up
2647 // x 1 1 1 up
2648 // collect lsb, guard, round, and sticky bits
2649 // std::cout << "shift to LSB " << (rightShift + adjustment) << " adjustment = " << adjustment << '\n';
2650 mask = (1ull << (rightShift + adjustment)); // bit mask for the lsb bit
2651 bool lsb = (mask & rawFraction);
2652 mask >>= 1;
2653 bool guard = (mask & rawFraction);
2654 mask >>= 1;
2655 bool round = (mask & rawFraction);
2656 if constexpr (rightShift > 1) {
2657 mask = (0xFFFF'FFFF'FFFF'FFFFull << (rightShift - 2));
2658 mask = ~mask;
2659 }
2660 else {
2661 mask = 0;
2662 }
2663 bool sticky = (mask & rawFraction);
2664 rawFraction >>= (static_cast<int64_t>(rightShift) + static_cast<int64_t>(adjustment));
2665
2666 // execute rounding operation
2667 if (guard) {
2668 if (lsb && (!round && !sticky)) ++rawFraction; // round to even
2669 if (round || sticky) ++rawFraction;
2670 if (rawFraction == (1ull << fbits)) { // overflow
2671 if (biasedExponent == ALL_ONES_ES) { // overflow to INF == .111..01
2672 rawFraction = INF_ENCODING;
2673 }
2674 else {
2675 ++biasedExponent;
2676 rawFraction = 0;
2677 }
2678 }
2679 }
2680 #if TRACE_CONVERSION
2681 std::cout << "lsb : " << (lsb ? "1\n" : "0\n");
2682 std::cout << "guard : " << (guard ? "1\n" : "0\n");
2683 std::cout << "round : " << (round ? "1\n" : "0\n");
2684 std::cout << "sticky : " << (sticky ? "1\n" : "0\n");
2685 std::cout << "rounding decision : " << (lsb && (!round && !sticky) ? "round to even\n" : "-\n");
2686 std::cout << "rounding direction: " << (round || sticky ? "round up\n" : "round down\n");
2687 #endif
2688 }
2689 else { // all bits of the float go into this representation and need to be shifted up, no rounding necessary
2690 int shiftLeft = fbits - ieee754_parameter<Real>::fbits;
2691 rawFraction <<= shiftLeft;
2692 }
2693 #if TRACE_CONVERSION
2694 std::cout << "biased exponent : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2695 std::cout << "right shift : " << rightShift << '\n';
2696 std::cout << "adjustment shift : " << adjustment << '\n';
2697 std::cout << "sticky bit mask : " << to_binary(mask, 32, true) << '\n';
2698 std::cout << "fraction bits : " << to_binary(rawFraction, 32, true) << '\n';
2699 #endif
2700 // construct the target cfloat
2701 uint64_t bits = (s ? 1ull : 0ull);
2702 bits <<= es;
2703 bits |= biasedExponent;
2704 bits <<= fbits;
2705 bits |= rawFraction;
2706 setbits(bits);
2707 }
2708 else {
2709 // the source real is a subnormal number
2710 mask = 0x00FF'FFFFu >> (fbits + exponent + subnormal_reciprocal_shift[es] + 1); // mask for sticky bit
2711
2712 // fraction processing: we have fbits+1 bits = 1 hidden + fbits explicit fraction bits
2713 // f = 1.ffff 2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2714 // -exponent because we are right shifting and exponent in this range is negative
2715 adjustment = -(exponent + subnormal_reciprocal_shift[es]); // this is the right shift adjustment due to the scale of the input number, i.e. the exponent of 2^-adjustment
2716
2717 std::cout << "source is subnormal: TBD\n";
2718 std::cout << "shift to LSB " << (rightShift + adjustment) << '\n';
2719 std::cout << "adjustment " << adjustment << '\n';
2720 std::cout << "exponent " << exponent << '\n';
2721 std::cout << "subnormal shift " << subnormal_reciprocal_shift[es] << '\n';
2722
2723 if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2724 // the value is a subnormal number in this representation
2725 }
2726 else {
2727 // the value is a normal number in this representation
2728 }
2729 }
2730 }
2731 else {
2732 // no need to round, but we need to shift left to deliver the bits
2733 // cfloat<40, 8> = float
2734 // cfloat<48, 9> = float
2735 // cfloat<56, 10> = float
2736 // cfloat<64, 11> = float
2737 // cfloat<64, 10> = double
2738 // can we go from an input subnormal to a cfloat normal?
2739 // yes, for example a cfloat<64,11> assigned to a subnormal float
2740
2741 // map exponent into target cfloat encoding
2742 uint64_t biasedExponent = static_cast<uint64_t>(static_cast<int64_t>(exponent) + EXP_BIAS);
2743 constexpr int upshift = fbits - ieee754_parameter<Real>::fbits;
2744 // output processing
2745 if constexpr (nbits < 65) {
2746 // we can compose the bits in a native 64-bit unsigned integer
2747 // common case: normal to normal
2748 // nbits = 40, es = 8, fbits = 31: rhs = float fbits = 23; shift left by (31 - 23) = 8
2749
2750 if (rawExponent != 0) {
2751 // rhs is a normal encoding
2752 uint64_t bits{ s ? 1ull : 0ull };
2753 bits <<= es;
2754 bits |= biasedExponent;
2755 bits <<= fbits;
2756 rawFraction <<= upshift;
2757 bits |= rawFraction;
2758 setbits(bits);
2759 }
2760 else {
2761 // rhs is a subnormal
2762 // std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
2763 // we need to calculate the effective scale to see
2764 // if this value becomes a normal, or maps to a subnormal encoding
2765 // in this target format
2766 }
2767 }
2768 else {
2769 // we need to write and shift bits into place
2770 // use cases are cfloats like cfloat<80, 11, bt>
2771 // even though the bits that come in are single or double precision
2772 // we need to write the fields and then shifting them in place
2773 //
2774 // common case: normal to normal
2775 if (rawExponent != 0) {
2776 // nbits = 128, es = 15, fbits = 112: rhs = float: shift left by (112 - 23) = 89
2777 setbits(biasedExponent);
2778 shiftLeft(fbits);
2779 bt fractionBlock[nrBlocks]{ 0 };
2780 // copy fraction bits
2781 size_t blocksRequired = (8 * sizeof(rawFraction) + 1) / bitsInBlock;
2782 size_t maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
2783 uint64_t mask = static_cast<uint64_t>(ALL_ONES); // set up the block mask
2784 size_t shift = 0;
2785 for (size_t i = 0; i < maxBlockNr; ++i) {
2786 fractionBlock[i] = bt((mask & rawFraction) >> shift);
2787 mask <<= bitsInBlock;
2788 shift += bitsInBlock;
2789 }
2790 // shift fraction bits
2791 int bitsToShift = upshift;
2792 if (bitsToShift >= static_cast<int>(bitsInBlock)) {
2793 int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
2794 for (int i = MSU; i >= blockShift; --i) {
2795 fractionBlock[i] = fractionBlock[i - blockShift];
2796 }
2797 for (int i = blockShift - 1; i >= 0; --i) {
2798 fractionBlock[i] = bt(0);
2799 }
2800 // adjust the shift
2801 bitsToShift -= blockShift * bitsInBlock;
2802 }
2803 if (bitsToShift > 0) {
2804 // construct the mask for the upper bits in the block that need to move to the higher word
2805 bt bitsToMoveMask = bt(ALL_ONES << (bitsInBlock - bitsToShift));
2806 for (size_t i = MSU; i > 0; --i) {
2807 fractionBlock[i] <<= bitsToShift;
2808 // mix in the bits from the right
2809 bt bits = static_cast<bt>(bitsToMoveMask & fractionBlock[i - 1]); // operator & yields an int
2810 fractionBlock[i] |= (bits >> (bitsInBlock - bitsToShift));
2811 }
2812 fractionBlock[0] <<= bitsToShift;
2813 }
2814 // OR the bits in
2815 for (size_t i = 0; i <= MSU; ++i) {
2816 _block[i] |= fractionBlock[i];
2817 }
2818 // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
2819 _block[MSU] &= MSU_MASK;
2820 // finally, set the sign bit
2821 setsign(s);
2822 }
2823 else {
2824 // rhs is a subnormal
2825 // std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
2826 }
2827 }
2828 }
2829 // post-processing results to implement saturation and projection after rounding logic
2830 if constexpr (isSaturating) {
2831 if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
2832 maxpos();
2833 }
2834 else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
2835 maxneg();
2836 }
2837 }
2838 else {
2839 if (isnan(NAN_TYPE_QUIET)) {
2840 setinf(false);
2841 }
2842 else if (isnan(NAN_TYPE_SIGNALLING)) {
2843 setinf(true);
2844 }
2845 }
2846 return *this; // TODO: unreachable in some configurations
2847 }
2848 }
2849
2850 // post-processing results to implement saturation and projection after rounding logic
2851 // arithmetic bit operations can't produce NaN encodings, so we need to re-interpret
2852 // these encodings and 'project' them to the proper values.
post_process()2853 void constexpr post_process() noexcept {
2854 if constexpr (isSaturating) {
2855 if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
2856 maxpos();
2857 }
2858 else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
2859 maxneg();
2860 }
2861 }
2862 else {
2863 if (isnan(NAN_TYPE_QUIET)) {
2864 setinf(false);
2865 }
2866 else if (isnan(NAN_TYPE_SIGNALLING)) {
2867 setinf(true);
2868 }
2869 }
2870 }
2871
2872 protected:
2873
2874 /// <summary>
2875 /// round a set of source bits to the present representation.
2876 /// srcbits is the number of bits of significant in the source representation
2877 /// </summary>
2878 /// <typeparam name="StorageType"></typeparam>
2879 /// <param name="raw"></param>
2880 /// <returns></returns>
2881 template<size_t srcbits, typename StorageType>
round(StorageType raw,int & exponent)2882 constexpr uint64_t round(StorageType raw, int& exponent) noexcept {
2883 if constexpr (fhbits < srcbits) {
2884 // round to even: lsb guard round sticky
2885 // collect guard, round, and sticky bits
2886 // this same logic will work for the case where
2887 // we only have a guard bit and no round and sticky bits
2888 // because the mask logic will make round and sticky both 0
2889 constexpr uint32_t shift = srcbits - fhbits - 1ull;
2890 StorageType mask = (StorageType(1ull) << shift);
2891 bool guard = (mask & raw);
2892 mask >>= 1;
2893 bool round = (mask & raw);
2894 if constexpr (shift > 1u) { // protect against a negative shift
2895 StorageType allones(StorageType(~0));
2896 mask = StorageType(allones << (shift - 2));
2897 mask = ~mask;
2898 }
2899 else {
2900 mask = 0;
2901 }
2902 bool sticky = (mask & raw);
2903
2904 raw >>= (shift + 1); // shift out the bits we are rounding away
2905 bool lsb = (raw & 0x1u);
2906 // ... lsb | guard round sticky round
2907 // x 0 x x down
2908 // 0 1 0 0 down round to even
2909 // 1 1 0 0 up round to even
2910 // x 1 0 1 up
2911 // x 1 1 0 up
2912 // x 1 1 1 up
2913 if (guard) {
2914 if (lsb && (!round && !sticky)) ++raw; // round to even
2915 if (round || sticky) ++raw;
2916 if (raw == (1ull << nbits)) { // overflow
2917 ++exponent;
2918 raw >>= 1u;
2919 }
2920 }
2921 }
2922 else {
2923 constexpr size_t shift = fhbits - srcbits;
2924 if constexpr (shift < (sizeof(StorageType))) {
2925 raw <<= shift;
2926 }
2927 else {
2928 std::cerr << "round: shift " << shift << " >= " << sizeof(StorageType) << std::endl;
2929 raw = 0;
2930 }
2931 }
2932 uint64_t significant = raw;
2933 return significant;
2934 }
2935 template<typename ArgumentBlockType>
copyBits(ArgumentBlockType v)2936 constexpr void copyBits(ArgumentBlockType v) {
2937 size_t blocksRequired = (8 * sizeof(v) + 1 ) / bitsInBlock;
2938 size_t maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
2939 bt b{ 0ul }; b = bt(~b);
2940 ArgumentBlockType mask = ArgumentBlockType(b);
2941 size_t shift = 0;
2942 for (size_t i = 0; i < maxBlockNr; ++i) {
2943 _block[i] = bt((mask & v) >> shift);
2944 mask <<= bitsInBlock;
2945 shift += bitsInBlock;
2946 }
2947 }
shiftLeft(int leftShift)2948 void shiftLeft(int leftShift) {
2949 if (leftShift == 0) return;
2950 if (leftShift < 0) return shiftRight(-leftShift);
2951 if (leftShift > long(nbits)) leftShift = nbits; // clip to max
2952 if (leftShift >= long(bitsInBlock)) {
2953 int blockShift = leftShift / static_cast<int>(bitsInBlock);
2954 for (signed i = signed(MSU); i >= blockShift; --i) {
2955 _block[i] = _block[i - blockShift];
2956 }
2957 for (signed i = blockShift - 1; i >= 0; --i) {
2958 _block[i] = bt(0);
2959 }
2960 // adjust the shift
2961 leftShift -= (long)(blockShift * bitsInBlock);
2962 if (leftShift == 0) return;
2963 }
2964 // construct the mask for the upper bits in the block that need to move to the higher word
2965 // bt mask = static_cast<bt>(0xFFFFFFFFFFFFFFFFull << (bitsInBlock - leftShift));
2966 bt mask = ALL_ONES;
2967 mask <<= (bitsInBlock - leftShift);
2968 for (unsigned i = MSU; i > 0; --i) {
2969 _block[i] <<= leftShift;
2970 // mix in the bits from the right
2971 bt bits = static_cast<bt>(mask & _block[i - 1]);
2972 _block[i] |= (bits >> (bitsInBlock - leftShift));
2973 }
2974 _block[0] <<= leftShift;
2975 }
shiftRight(int rightShift)2976 void shiftRight(int rightShift) {
2977 if (rightShift == 0) return;
2978 if (rightShift < 0) return shiftLeft(-rightShift);
2979 if (rightShift >= long(nbits)) {
2980 setzero();
2981 return;
2982 }
2983 bool signext = sign();
2984 size_t blockShift = 0;
2985 if (rightShift >= long(bitsInBlock)) {
2986 blockShift = rightShift / bitsInBlock;
2987 if (MSU >= blockShift) {
2988 // shift by blocks
2989 for (size_t i = 0; i <= MSU - blockShift; ++i) {
2990 _block[i] = _block[i + blockShift];
2991 }
2992 }
2993 // adjust the shift
2994 rightShift -= (long)(blockShift * bitsInBlock);
2995 if (rightShift == 0) {
2996 // fix up the leading zeros if we have a negative number
2997 if (signext) {
2998 // rightShift is guaranteed to be less than nbits
2999 rightShift += (long)(blockShift * bitsInBlock);
3000 for (size_t i = nbits - rightShift; i < nbits; ++i) {
3001 this->setbit(i);
3002 }
3003 }
3004 else {
3005 // clean up the blocks we have shifted clean
3006 rightShift += (long)(blockShift * bitsInBlock);
3007 for (size_t i = nbits - rightShift; i < nbits; ++i) {
3008 this->setbit(i, false);
3009 }
3010 }
3011 }
3012 }
3013
3014 bt mask = ALL_ONES;
3015 mask >>= (bitsInBlock - rightShift); // this is a mask for the lower bits in the block that need to move to the lower word
3016 for (unsigned i = 0; i < MSU; ++i) { // TODO: can this be improved? we should not have to work on the upper blocks in case we block shifted
3017 _block[i] >>= rightShift;
3018 // mix in the bits from the left
3019 bt bits = static_cast<bt>(mask & _block[i + 1]); // & operator returns an int
3020 _block[i] |= (bits << (bitsInBlock - rightShift));
3021 }
3022 _block[MSU] >>= rightShift;
3023
3024 // fix up the leading zeros if we have a negative number
3025 if (signext) {
3026 // bitsToShift is guaranteed to be less than nbits
3027 rightShift += (long)(blockShift * bitsInBlock);
3028 for (size_t i = nbits - rightShift; i < nbits; ++i) {
3029 this->setbit(i);
3030 }
3031 }
3032 else {
3033 // clean up the blocks we have shifted clean
3034 rightShift += (long)(blockShift * bitsInBlock);
3035 for (size_t i = nbits - rightShift; i < nbits; ++i) {
3036 this->setbit(i, false);
3037 }
3038 }
3039
3040 // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
3041 _block[MSU] &= MSU_MASK;
3042 }
3043
3044 // calculate the integer power 2 ^ b using exponentiation by squaring
ipow(int exponent) const3045 double ipow(int exponent) const {
3046 bool negative = (exponent < 0);
3047 exponent = negative ? -exponent : exponent;
3048 double result(1.0);
3049 double base = 2.0;
3050 for (;;) {
3051 if (exponent % 2) result *= base;
3052 exponent >>= 1;
3053 if (exponent == 0) break;
3054 base *= base;
3055 }
3056 return (negative ? (1.0 / result) : result);
3057 }
3058
3059 private:
3060 bt _block[nrBlocks];
3061
3062 //////////////////////////////////////////////////////////////////////////////
3063 // friend functions
3064
3065 // template parameters need names different from class template parameters (for gcc and clang)
3066 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3067 friend std::ostream& operator<< (std::ostream& ostr, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3068 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3069 friend std::istream& operator>> (std::istream& istr, cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3070
3071 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3072 friend bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3073 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3074 friend bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3075 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3076 friend bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3077 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3078 friend bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3079 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3080 friend bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3081 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3082 friend bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3083 };
3084
3085 ////////////////////// operators
3086 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <<(std::ostream & ostr,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3087 inline std::ostream& operator<<(std::ostream& ostr, const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating>& v) {
3088 // TODO: make it a native conversion
3089 ostr << double(v);
3090 return ostr;
3091 }
3092
3093 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >>(std::istream & istr,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3094 inline std::istream& operator>>(std::istream& istr, const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating>& v) {
3095 istr >> v._fraction;
3096 return istr;
3097 }
3098
3099 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator ==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3100 inline bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
3101 for (size_t i = 0; i < lhs.nrBlocks; ++i) {
3102 if (lhs._block[i] != rhs._block[i]) {
3103 return false;
3104 }
3105 }
3106 return true;
3107 }
3108 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator !=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3109 inline bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator==(lhs, rhs); }
3110 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator <(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3111 inline bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return (lhs - rhs).isneg(); }
3112 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator >(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3113 inline bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return operator< (rhs, lhs); }
3114 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator <=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3115 inline bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator> (lhs, rhs); }
3116 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator >=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3117 inline bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator< (lhs, rhs); }
3118
3119 // posit - posit binary arithmetic operators
3120 // BINARY ADDITION
3121 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator +(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3122 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3123 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> sum(lhs);
3124 sum += rhs;
3125 return sum;
3126 }
3127 // BINARY SUBTRACTION
3128 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator -(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3129 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3130 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> diff(lhs);
3131 diff -= rhs;
3132 return diff;
3133 }
3134 // BINARY MULTIPLICATION
3135 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator *(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3136 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3137 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> mul(lhs);
3138 mul *= rhs;
3139 return mul;
3140 }
3141 // BINARY DIVISION
3142 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator /(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3143 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3144 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> ratio(lhs);
3145 ratio /= rhs;
3146 return ratio;
3147 }
3148
3149 // encoding helpers
3150
3151 // return the Unit in the Last Position
3152 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
ulp(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & a)3153 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> ulp(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& a) {
3154 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> b(a);
3155 return ++b - a;
3156 }
3157
3158 // convert to std::string
3159 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_string(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3160 inline std::string to_string(const cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
3161 std::stringstream s;
3162 if (v.iszero()) {
3163 s << " zero b";
3164 return s.str();
3165 }
3166 else if (v.isinf()) {
3167 s << " infinite b";
3168 return s.str();
3169 }
3170 // s << "(" << (v.sign() ? "-" : "+") << "," << v.scale() << "," << v.fraction() << ")";
3171 return s.str();
3172 }
3173
3174 // transform cfloat to a binary representation
3175 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_binary(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & number,bool nibbleMarker=false)3176 inline std::string to_binary(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& number, bool nibbleMarker = false) {
3177 std::stringstream s;
3178 s << "0b";
3179 size_t index = nbits;
3180 s << (number.at(--index) ? '1' : '0') << '.';
3181
3182 for (int i = int(es)-1; i >= 0; --i) {
3183 s << (number.at(--index) ? '1' : '0');
3184 if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
3185 }
3186
3187 s << '.';
3188
3189 constexpr int fbits = nbits - 1ull - es;
3190 for (int i = fbits-1; i >= 0; --i) {
3191 s << (number.at(--index) ? '1' : '0');
3192 if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
3193 }
3194
3195 return s.str();
3196 }
3197
3198 // transform a cfloat into a triple representation
3199 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_triple(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & number,bool nibbleMarker=true)3200 inline std::string to_triple(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& number, bool nibbleMarker = true) {
3201 std::stringstream s;
3202 blocktriple<cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>::fbits, BlockTripleOperator::REPRESENTATION, bt> triple;
3203 number.normalize(triple);
3204 s << to_triple(triple, nibbleMarker);
3205 return s.str();
3206 }
3207
3208 // Magnitude of a cfloat (equivalent to turning the sign bit off).
3209 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
3210 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
abs(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3211 abs(const cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
3212 return cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>(false, v.scale(), v.fraction(), v.isZero());
3213 }
3214
3215 ///////////////////////////////////////////////////////////////////////
3216 /// binary logic literal comparisons
3217
3218 // cfloat - literal float logic operators
3219 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3220 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3221 return float(lhs) == rhs;
3222 }
3223 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3224 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3225 return float(lhs) != rhs;
3226 }
3227 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3228 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3229 return float(lhs) < rhs;
3230 }
3231 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3232 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3233 return float(lhs) > rhs;
3234 }
3235 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3236 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3237 return float(lhs) <= rhs;
3238 }
3239 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3240 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3241 return float(lhs) >= rhs;
3242 }
3243 // cfloat - literal double logic operators
3244 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3245 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3246 return double(lhs) == rhs;
3247 }
3248 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3249 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3250 return double(lhs) != rhs;
3251 }
3252 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3253 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3254 return double(lhs) < rhs;
3255 }
3256 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3257 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3258 return double(lhs) > rhs;
3259 }
3260 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3261 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3262 return double(lhs) <= rhs;
3263 }
3264 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3265 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3266 return double(lhs) >= rhs;
3267 }
3268
3269 #if LONG_DOUBLE_SUPPORT
3270 // cfloat - literal long double logic operators
3271 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3272 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3273 return (long double)(lhs) == rhs;
3274 }
3275 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3276 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3277 return (long double)(lhs) != rhs;
3278 }
3279 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3280 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3281 return (long double)(lhs) < rhs;
3282 }
3283 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3284 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3285 return (long double)(lhs) > rhs;
3286 }
3287 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3288 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3289 return (long double)(lhs) <= rhs;
3290 }
3291 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3292 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3293 return (long double)(lhs) >= rhs;
3294 }
3295 #endif
3296
3297 // cfloat - literal int logic operators
3298 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3299 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3300 return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3301 }
3302 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3303 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3304 return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3305 }
3306 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3307 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3308 return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3309 }
3310 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3311 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3312 return operator<(cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs), lhs);
3313 }
3314 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3315 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3316 return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt>(rhs));
3317 }
3318 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3319 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3320 return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3321 }
3322
3323 // cfloat - long long logic operators
3324 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3325 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3326 return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3327 }
3328 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3329 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3330 return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3331 }
3332 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3333 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3334 return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3335 }
3336 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3337 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3338 return operator<(cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs), lhs);
3339 }
3340 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3341 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3342 return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt>(rhs));
3343 }
3344 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3345 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3346 return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3347 }
3348
3349 } // namespace sw::universal
3350