1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #pragma once
10
11 #include <limits.h>
12 #include <cmath>
13
14 namespace iSTD
15 {
16 /*****************************************************************************\
17 Constants:
18 FPU_FLOAT32_*
19
20 Description:
21 Binary representation of 32-bit floating point specials.
22 FPU_FLOAT32_COMPUTE special value can be used in result tables to mark
23 cases, where final value should be computed normally.
24 \*****************************************************************************/
25 const DWORD FPU_FLOAT32_NAN = 0x7FFFFFFF;
26 const DWORD FPU_FLOAT32_NEG_INF = 0xFF800000;
27 const DWORD FPU_FLOAT32_POS_INF = 0x7F800000;
28 const DWORD FPU_FLOAT32_NEG_ZERO = 0x80000000;
29 const DWORD FPU_FLOAT32_POS_ZERO = 0x00000000;
30 const DWORD FPU_FLOAT32_COMPUTE = 0xFFFFFFFF;
31 const DWORD FPU_FLOAT32_ONE = (DWORD) 0x3F800000;
32 const DWORD FPU_FLOAT32_MINUS_ONE = (DWORD) 0xBF800000;
33
34
/*****************************************************************************\
Enumeration:
    FPU_FLOAT_CLASS

Description:
    Classification of a 32-bit floating point number, ordered from -Inf up
    to +Inf with NaN last. The numeric values are used as row and column
    indices of the result tables in this file, so they must stay dense and
    start at zero. NUM_FPU_FLOAT_CLASSES is the number of classes.
\*****************************************************************************/
enum FPU_FLOAT_CLASS
{
    FPU_FLOAT_CLASS_NEG_INF    = 0,   // -Inf
    FPU_FLOAT_CLASS_NEG_FINITE = 1,   // negative, normalized
    FPU_FLOAT_CLASS_NEG_DENORM = 2,   // negative, denormalized
    FPU_FLOAT_CLASS_NEG_ZERO   = 3,   // -0
    FPU_FLOAT_CLASS_POS_ZERO   = 4,   // +0
    FPU_FLOAT_CLASS_POS_DENORM = 5,   // positive, denormalized
    FPU_FLOAT_CLASS_POS_FINITE = 6,   // positive, normalized
    FPU_FLOAT_CLASS_POS_INF    = 7,   // +Inf
    FPU_FLOAT_CLASS_NAN        = 8,   // any NaN
    NUM_FPU_FLOAT_CLASSES      = 9    // count of the classes above
};
55
56 /*****************************************************************************\
57 Inline Function:
58 Float32GetClass
59
60 Description:
61 Returns class (+0, -0, +finite, -finite, +Inf, -Inf, NaN) of 32-bit float.
62 \*****************************************************************************/
Float32GetClass(const float f)63 inline FPU_FLOAT_CLASS Float32GetClass( const float f )
64 {
65 FLOAT32 f32;
66 f32.value.f = f;
67
68 switch( f32.value.u )
69 {
70 case FPU_FLOAT32_POS_ZERO: return FPU_FLOAT_CLASS_POS_ZERO;
71 case FPU_FLOAT32_NEG_ZERO: return FPU_FLOAT_CLASS_NEG_ZERO;
72 case FPU_FLOAT32_POS_INF: return FPU_FLOAT_CLASS_POS_INF;
73 case FPU_FLOAT32_NEG_INF: return FPU_FLOAT_CLASS_NEG_INF;
74 default: break;
75 }
76
77 if( f32.exponent == 0xFF )
78 {
79 return FPU_FLOAT_CLASS_NAN;
80 }
81 else if( f32.exponent == 0x00 )
82 {
83 if( f32.sign == 0 )
84 {
85 return FPU_FLOAT_CLASS_POS_DENORM;
86 }
87 else
88 {
89 return FPU_FLOAT_CLASS_NEG_DENORM;
90 }
91 }
92
93 if( f32.sign )
94 {
95 return FPU_FLOAT_CLASS_NEG_FINITE;
96 }
97
98 return FPU_FLOAT_CLASS_POS_FINITE;
99 }
100
101 /*****************************************************************************\
102 Inline Function:
103 Float32IsInfinity
104
105 Description:
106 Returns true if class is +Inf or -Inf of 32-bit float.
107 \*****************************************************************************/
Float32IsInfinity(const float f)108 inline bool Float32IsInfinity( const float f )
109 {
110 FPU_FLOAT_CLASS fClass = Float32GetClass( f );
111
112 return ( fClass == FPU_FLOAT_CLASS_POS_INF ) ||
113 ( fClass == FPU_FLOAT_CLASS_NEG_INF );
114 }
115
116 /*****************************************************************************\
117 Inline Function:
118 Float32IsDenorm
119
120 Description:
121 Returns true if class is +Denorm or -Denorm.
122 \*****************************************************************************/
Float32IsDenorm(const float f)123 inline bool Float32IsDenorm( const float f )
124 {
125 FPU_FLOAT_CLASS fClass = Float32GetClass( f );
126
127 return ( fClass == FPU_FLOAT_CLASS_NEG_DENORM ) ||
128 ( fClass == FPU_FLOAT_CLASS_POS_DENORM );
129 }
130
131 /*****************************************************************************\
132
133 Inline Function:
134 Float32IsFinite
135
136 Description:
137 Returns true if f is finite: not +/-INF, and not NaN.
138 \*****************************************************************************/
Float32IsFinite(const float f)139 inline bool Float32IsFinite( const float f )
140 {
141 FPU_FLOAT_CLASS fClass = Float32GetClass( f );
142
143 return ( fClass != FPU_FLOAT_CLASS_NAN ) &&
144 ( fClass != FPU_FLOAT_CLASS_NEG_INF ) &&
145 ( fClass != FPU_FLOAT_CLASS_POS_INF );
146 }
147
148 /*****************************************************************************\
149 Inline Function:
150 IsFPZero
151
152 Description:
153 Returns true if the argument x seen as a 32-bit IEEE754 floating point
154 number is either positive or negative zero +0.0, -0.0.
155
156 Input:
157 dword value that will be interpreted as a binary32 representation
158 of single-precision floating point value.
159
160 Output:
161 True if the value represents either positive or negative float zero.
162
163 \*****************************************************************************/
IsFPZero(const DWORD x)164 inline bool IsFPZero( const DWORD x )
165 {
166 return ( x == iSTD::FPU_FLOAT32_POS_ZERO ) ||
167 ( x == iSTD::FPU_FLOAT32_NEG_ZERO );
168 }
169
170 /*****************************************************************************\
171 Inline Function:
172 Float32SafeAdd
173
174 Description:
175 Performs addition taking care of floating point specials in software.
176 \*****************************************************************************/
Float32SafeAdd(const float arg1,const float arg2,const bool denormRetain)177 inline float Float32SafeAdd( const float arg1, const float arg2, const bool denormRetain )
178 {
179 // Table for handling IEEE 754 specials in addition
180 //
181 // a + b -Inf -X -0 +0 +X +Inf NaN
182 //
183 // -Inf -Inf -Inf -Inf -Inf -Inf NaN NaN
184 // -X -Inf <add> <add> <add> <add> +Inf NaN
185 // -0 -Inf <add> -0 +0 <add> +Inf NaN
186 // +0 -Inf <add> +0 +0 <add> +Inf NaN
187 // +X -Inf <add> <add> <add> <add> +Inf NaN
188 // +Inf NaN +Inf +Inf +Inf +Inf +Inf NaN
189 // NaN NaN NaN NaN NaN NaN NaN NaN
190 //
191
192 static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
193 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
194 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // -Inf
195 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // -X
196 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // -denorm
197 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // -0
198 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +0
199 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +denorm
200 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +X
201 { FPU_FLOAT32_NAN , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +Inf
202 { FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // NaN
203 };
204
205 const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
206 const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
207
208 FLOAT32 f32;
209 f32.value.u = RESULT[ t1 ][ t2 ];
210
211 bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
212
213 if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
214 {
215 return arg1 + arg2;
216 }
217
218 return f32.value.f;
219 }
220
221 /*****************************************************************************\
222 Inline Function:
223 Float32SafeSubtract
224
225 Description:
226 Performs subtraction taking care of floating point specials in software.
227 \*****************************************************************************/
Float32SafeSubtract(const float arg1,const float arg2,const bool denormRetain)228 inline float Float32SafeSubtract( const float arg1, const float arg2, const bool denormRetain )
229 {
230 FLOAT32 f32;
231 f32.value.f = arg2;
232
233 // flip sign bit
234 f32.sign ^= 1;
235
236 return Float32SafeAdd( arg1, f32.value.f, denormRetain );
237 }
238
239 /*****************************************************************************\
240 Inline Function:
241 Float32SafeMultiply
242
243 Description:
244 Performs multiplication taking care of floating point specials in software.
245 \*****************************************************************************/
Float32SafeMultiply(const float arg1,const float arg2,const bool denormRetain)246 inline float Float32SafeMultiply( const float arg1, const float arg2, const bool denormRetain )
247 {
248 // Table for handling IEEE 754 specials in multiplication
249 //
250 // a * b -Inf -X -0 +0 +X +Inf NaN
251 //
252 // -Inf +Inf +Inf NaN NaN -Inf -Inf NaN
253 // -X +Inf <mul> +0 -0 <mul> -Inf NaN
254 // -0 NaN +0 +0 -0 -0 NaN NaN
255 // +0 NaN -0 -0 +0 +0 NaN NaN
256 // +X -Inf <mul> -0 +0 <mul> +Inf NaN
257 // +Inf -Inf -Inf NaN NaN +Inf +Inf NaN
258 // NaN NaN NaN NaN NaN NaN NaN NaN
259 //
260
261 static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
262 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
263 { FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN }, // -Inf
264 { FPU_FLOAT32_POS_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN }, // -X
265 { FPU_FLOAT32_NAN , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // -denorm
266 { FPU_FLOAT32_NAN , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // -0
267 { FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // +0
268 { FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // +denorm
269 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +X
270 { FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN }, // +Inf
271 { FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // NaN
272 };
273
274 FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
275 FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
276
277 FLOAT32 f32;
278 f32.value.u = RESULT[ t1 ][ t2 ];
279
280 bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
281
282 if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
283 {
284 return arg1 * arg2;
285 }
286
287 return f32.value.f;
288 }
289
290 /*****************************************************************************\
291 Inline Function:
292 Float32SafeFMA
293
294 Description:
295 Performs fused mutliply and add taking care of floating point specials in
296 software.
297
298 This is machine generated code provided by SSG.
299
300 \*****************************************************************************/
Float32SafeFMA(const float a,const float b,const float c)301 inline float Float32SafeFMA( const float a, const float b, const float c )
302 {
303 const DWORD _own_large_value_32[] = { 0x71800000, 0xf1800000 };
304 const DWORD _own_small_value_32[] = { 0x0d800000, 0x8d800000 };
305 const DWORD _ones[] = { 0x3f800000, 0xbf800000 };
306
307 DWORD ux = 0;
308 DWORD uy = 0;
309 DWORD uz = 0;
310 DWORD ur = 0;
311 DWORD xbits = 0;
312 DWORD ybits = 0;
313 DWORD zbits = 0;
314 DWORD uhi = 0;
315 DWORD ulo = 0;
316 DWORD vhi = 0;
317 DWORD vlo = 0;
318 DWORD remain = 0;
319 DWORD temp = 0;
320 DWORD L_mask = 0;
321 DWORD R_mask = 0;
322
323 INT zsign = 0;
324 INT rsign = 0;
325 INT xexp = 0;
326 INT yexp = 0;
327 INT zexp = 0;
328 INT rexp = 0;
329 INT carry = 0;
330 INT borrow = 0;
331 INT rm = 0;
332 INT shift = 0;
333 INT L_shift = 0;
334 INT R_shift = 0;
335
336 UINT64 ubits = 0;
337 float resultf = 0;
338 float tv = 0;
339 float x = a;
340 float y = b;
341 float z = c;
342
343 // Set to round to nearest even.
344 rm = 0;
345
346 ux = FLOAT32( x >= 0.0f ? x : -x ).value.u;
347 uy = FLOAT32( y >= 0.0f ? y : -y ).value.u;;
348 uz = FLOAT32( z >= 0.0f ? z : -z ).value.u;;
349
350 int cond1 = ( ux == 0 ) |
351 ( ux >= 0x7f800000 ) |
352 ( ux == 0x3f800000 ) |
353 ( uy == 0 ) |
354 ( uy >= 0x7f800000 ) |
355 ( uy == 0x3f800000 ) |
356 ( uz == 0 ) |
357 ( uz >= 0x7f800000 );
358
359 if( cond1 != 0 )
360 {
361 if( Float32IsInfinity( z ) &&
362 !Float32IsInfinity( x ) &&
363 !Float32IsInfinity( y ) )
364 {
365 resultf = ( z + x ) + y;
366 }
367 else
368 {
369 resultf = x * y + z;
370 }
371
372 return resultf;
373 }
374
375 xexp = (int)( ux >> 23 );
376 yexp = (int)( uy >> 23 );
377 zexp = (int)( uz >> 23 );
378
379 xbits = 0x00800000 | ( ux & 0x007fffff );
380 ybits = 0x00800000 | ( uy & 0x007fffff );
381 zbits = 0x00800000 | ( uz & 0x007fffff );
382
383
384 rsign = ( FLOAT32(x).value.s ^ FLOAT32(y).value.s ) & 0x80000000;
385 rexp = ( xexp + yexp ) - 0x7F;
386 ubits = (UINT64)xbits * ybits;
387
388 if( (DWORD) ( ubits >> 32 ) & 0x00008000 )
389 {
390 uhi = (DWORD)( ubits >> 24 );
391 ulo = ( (DWORD)ubits << 8 );
392 rexp++;
393 }
394 else
395 {
396 uhi = (DWORD)( ubits >> 23 );
397 ulo = ( (DWORD)ubits << 9 );
398 }
399
400 int cond2 = ( rexp > zexp ) |
401 ( ( rexp == zexp ) & ( uhi >= zbits ) );
402
403 if( cond2 != 0 )
404 {
405 shift = ( rexp - zexp );
406 vhi = zbits;
407 vlo = 0;
408 zsign = FLOAT32(z).value.s & 0x80000000;
409 }
410 else
411 {
412 shift = ( zexp - rexp );
413 rexp = zexp;
414 vhi = uhi;
415 vlo = ulo;
416 uhi = zbits;
417 ulo = 0;
418 zsign = rsign;
419 rsign = FLOAT32(z).value.s & 0x80000000;
420 }
421
422 remain = 0;
423 if( shift != 0 )
424 {
425 if( shift < 32 )
426 {
427 L_shift = 32 - shift;
428 R_shift = shift - 0;
429 L_mask = ~( 0xffffffffu >> R_shift );
430 remain = ( vlo << L_shift );
431 vlo = ( ( vhi << L_shift ) & L_mask) | ( vlo >> R_shift );
432 vhi = ( vhi >> R_shift );
433 }
434 else if( shift < 64 )
435 {
436 L_shift = 64 - shift;
437 R_shift = shift - 32;
438 L_mask = ~( 0xffffffffu >> R_shift );
439 remain = ( ( vhi << L_shift ) & L_mask ) | ( vlo != 0 );
440 vlo = ( vhi >> R_shift );
441 vhi = 0;
442 }
443 else
444 {
445 remain = ( vhi | vlo ) != 0;
446 vhi = vlo = 0;
447 }
448 }
449
450 if( rsign == zsign )
451 {
452 temp = ulo;
453 ulo += vlo;
454 carry = ( ulo < temp );
455 uhi += ( vhi + carry );
456
457 if ( uhi & 0x01000000 )
458 {
459 remain = ( uhi << 31 ) | ( ( ulo | remain ) != 0 );
460 ur = ( uhi >> 1 ) & 0x007fffff;
461 rexp += 1;
462 }
463 else
464 {
465 remain = ulo | ( remain != 0 );
466 ur = (uhi & 0x007fffff);
467 }
468 }
469 else
470 {
471 remain = ( 0 - remain );
472 borrow = ( remain != 0 );
473 temp = ulo;
474 ulo -= borrow;
475 borrow = ( ulo > temp );
476 uhi -= borrow;
477 temp = ulo;
478 ulo -= vlo;
479 borrow = ( ulo > temp );
480 uhi -= borrow;
481 uhi -= vhi;
482
483 if( uhi != 0 )
484 {
485 temp = ( uhi << 8 );
486 shift = 0;
487 }
488 else if( ulo != 0 )
489 {
490 temp = ulo;
491 shift = 24;
492 }
493 else if( remain != 0 )
494 {
495 temp = remain;
496 shift = 24 + 32;
497 }
498 else
499 {
500 return FLOAT32( (DWORD)0x00000000 ).value.f;
501 }
502
503 shift += clz( temp );
504
505 if( shift < 32 )
506 {
507 L_shift = shift - 0;
508 R_shift = 32 - shift;
509 R_mask = ( (DWORD) 1 << L_shift ) - 1;
510 ur = ( ( uhi << L_shift ) | (( ulo >> R_shift ) & R_mask ) ) & 0x007fffff;
511 remain = ( ulo << L_shift ) | ( remain != 0 );
512 }
513 else if( shift < 64 )
514 {
515 L_shift = shift - 32;
516 R_shift = 64 - shift;
517 R_mask = ( (DWORD) 1 << L_shift ) - 1;
518 ur = ( ( ulo << L_shift ) | ( ( remain >> R_shift ) & R_mask ) ) & 0x007fffff;
519 remain = ( remain << L_shift );
520 }
521 else
522 {
523 L_shift = shift - 64;
524 ur = ( remain << L_shift ) & 0x007fffff;
525 remain = 0;
526 }
527 rexp -= shift;
528 }
529
530 if( (DWORD) rexp - 1 >= 0xFF - 1 )
531 {
532 if( rexp >= 0xFF )
533 {
534 rsign = ( (DWORD)rsign >> 31 );
535 if( rsign )
536 {
537 resultf = tv = FLOAT32(_own_large_value_32[(1)]).value.f * FLOAT32(_own_large_value_32[0]).value.f;
538 }
539 else
540 {
541 resultf = tv = FLOAT32(_own_large_value_32[(0)]).value.f * FLOAT32(_own_large_value_32[0]).value.f;
542 }
543
544 return resultf;
545 }
546 else
547 {
548 //enters here only for rexp = 0
549 L_shift = 31;
550 R_shift = 1;
551 L_mask = ~(0xffffffffu >> R_shift );
552 ur |= 0x00800000;
553 remain = ( ( ur << L_shift ) & L_mask ) | ( remain != 0 );
554 ur = ( ur >> R_shift );
555
556 }
557 }
558 else
559 {
560 ur |= ( rexp << 23 );
561 }
562
563 if( remain != 0 )
564 {
565 tv = ( ( (float *)_ones)[0] + ( (float *)_own_small_value_32)[0] );
566
567 int cond3, cond4, cond5, cond6;
568
569 switch( rm )
570 {
571 case ( 0 << 10 ):
572 cond3 = ( ( remain & 0x80000000 ) != 0 ) & ( ( ( ur & 1 ) != 0 ) |
573 ( ( remain & ~0x80000000 ) != 0 ) );
574 if( cond3 != 0 )
575 {
576 ur++;
577 if( ur >= 0x7f800000 )
578 {
579 rsign = ( (unsigned)rsign >> 31 );
580 if( rsign )
581 {
582 resultf = tv =
583 ( ( (float *) _own_large_value_32)[1] *
584 ( (float *) _own_large_value_32)[0] );
585 }
586 else
587 {
588 resultf = tv =
589 (((float *) _own_large_value_32)[(0)] *
590 ((float *) _own_large_value_32)[0]);
591 }
592
593 return resultf;
594 }
595 }
596
597 case ( 3 << 10 ):
598 cond4 = ( ur < 0x00800000 ) |
599 ( (ur == 0x00800000 ) & ( remain == 0x80000000 ) );
600
601 if( cond4 != 0 )
602 {
603 tv = ( ( ( float *)_own_small_value_32)[0] *
604 ( ( float *)_own_small_value_32)[0] );
605 }
606 break;
607
608 case ( 2 << 10 ):
609 cond5 = ( rsign & ( ur < 0x00800000 ) ) |
610 ( (!rsign) & ( (ur < 0x007fffff ) | ( ( ur == 0x007fffff ) & ( remain < 0x80000000 ) ) ) );
611
612 if( cond5 != 0 )
613 {
614 tv = ( ( (float *)_own_small_value_32)[0] *
615 ( (float *)_own_small_value_32)[0] );
616 }
617
618 if( !rsign )
619 {
620 ur++;
621 if( ur >= 0x7f800000 )
622 {
623 //rsign = ((unsigned) rsign >> 31);
624 resultf = tv = ( ( (float *)_own_large_value_32)[0] *
625 ( (float *)_own_large_value_32)[0] );
626 return resultf;
627 }
628 }
629 break;
630
631 case ( 1 << 10 ):
632 cond6 = ( !rsign & ( ur < 0x00800000 ) ) |
633 ( rsign & ( (ur < 0x007fffff ) | ( ( ur == 0x007fffff ) & ( remain < 0x80000000 ) ) ) );
634
635 if( cond6 != 0 )
636 {
637 tv = ( ( (float *)_own_small_value_32)[0] *
638 ( (float *)_own_small_value_32)[0] );
639 }
640
641 if( rsign )
642 {
643 ur++;
644 if (ur >= 0x7f800000 )
645 {
646 //rsign = ((unsigned) rsign >> 31);
647 resultf = tv =
648 ( ( (float *)_own_large_value_32)[1] *
649 ( (float *)_own_large_value_32)[0] );
650
651 return resultf;
652 }
653 }
654 break;
655 }
656 }
657
658 resultf = FLOAT32( (DWORD) (rsign | ur ) ).value.f;
659
660 return resultf;
661 }
662
663 /*****************************************************************************\
664 Inline Function:
665 Float32SafeRSQRT
666
667 Description:
668 Performs correctly rounded single precision reciprocal square root
669 operation taking care of floating point specials in software.
670 \*****************************************************************************/
Float32SafeRSQRT(const float arg,bool denormRetain)671 inline float Float32SafeRSQRT( const float arg, bool denormRetain )
672 {
673 static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES] =
674 {
675 FPU_FLOAT32_NAN, // rsqrt( -inf ) = NaN
676 FPU_FLOAT32_NAN, // rsqrt( -X ) = NaN //but to be really OK,we should try to maintain the NaN payload
677 FPU_FLOAT32_NAN, // rsqrt( -denorm ) = NaN //but to be really OK,we should try to maintain the NaN payload
678 FPU_FLOAT32_NEG_INF, // rsqrt( -0 ) = -inf
679 FPU_FLOAT32_POS_INF, // rsqrt( +0 ) = +inf
680 FPU_FLOAT32_COMPUTE, // rsqrt( +denorm) = computed value
681 FPU_FLOAT32_COMPUTE, // rsqrt( +X ) == computed value
682 FPU_FLOAT32_POS_ZERO, // rsqrt( +inf ) == +0.0
683 FPU_FLOAT32_NAN // rsqrt( NaN ) == NaN
684 };
685
686 FPU_FLOAT_CLASS t1 = Float32GetClass( arg );
687
688 FLOAT32 f32;
689 f32.value.u = RESULT[ t1 ];
690
691 bool computeDenorms = denormRetain && Float32IsDenorm( arg );
692
693 if ( !computeDenorms && t1 == FPU_FLOAT_CLASS_NEG_DENORM )
694 {
695 f32.value.u = FPU_FLOAT32_NEG_INF;
696 }
697 if ( !computeDenorms && t1 == FPU_FLOAT_CLASS_POS_DENORM )
698 {
699 f32.value.u = FPU_FLOAT32_POS_INF;
700 }
701
702 if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
703 {
704 double darg = arg;
705 double s = sqrt(darg); //double-precision square root
706 double result = 1.0 / s; //double-precision division
707 return static_cast<float>(result); //back to floats
708 }
709
710 return f32.value.f;
711 }
712
713 /*****************************************************************************\
714 Inline Function:
715 Float32SafeDivide
716
717 Description:
718 Performs division taking care of floating point specials in software.
719 \*****************************************************************************/
Float32SafeDivide(const float arg1,const float arg2,const bool denormRetain)720 inline float Float32SafeDivide( const float arg1, const float arg2, const bool denormRetain )
721 {
722 // Table for handling IEEE 754 specials in division
723 //
724 // a / b -Inf -X -0 +0 +X +Inf NaN
725 //
726 // -Inf NaN +Inf +Inf -Inf -Inf NaN NaN
727 // -X +0 <div> +Inf -Inf <div> -0 NaN
728 // -0 +0 +0 NaN NaN -0 -0 NaN
729 // +0 -0 -0 NaN NaN +0 +0 NaN
730 // +X -0 <div> -Inf +Inf <div> +0 NaN
731 // +Inf NaN -Inf -Inf +Inf +Inf NaN NaN
732 // NaN NaN NaN NaN NaN NaN NaN NaN
733 //
734
735 static const DWORD RESULT[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
736 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
737 { FPU_FLOAT32_NAN , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // -Inf
738 { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN }, // -X
739 { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN }, // -denorm
740 { FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN }, // -0
741 { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN }, // +0
742 { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN }, // +denorm
743 { FPU_FLOAT32_NEG_ZERO, FPU_FLOAT32_COMPUTE , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_COMPUTE , FPU_FLOAT32_POS_ZERO, FPU_FLOAT32_NAN }, // +X
744 { FPU_FLOAT32_NAN , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_NEG_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_POS_INF , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // +Inf
745 { FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN , FPU_FLOAT32_NAN }, // NaN
746 };
747
748 FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
749 FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
750
751 FLOAT32 f32;
752 f32.value.u = RESULT[ t1 ][ t2 ];
753
754 bool computeDenorms = ( denormRetain && ( Float32IsDenorm( arg1 ) || Float32IsDenorm( arg2 ) ) );
755
756 if( ( f32.value.u == FPU_FLOAT32_COMPUTE ) || ( computeDenorms ) )
757 {
758 return arg1 / arg2;
759 }
760
761 return f32.value.f;
762 }
763
/*****************************************************************************\
Inline Function:
    Signed32SafeDivideQuotient

Description:
    Computes src0 divided by src1 without ever executing an undefined
    division.

    Table for handling signed divide quotient (Q) and remainder (R):

        IDIV                    SRC0
        SRC1    +INT            -INT            0
        +INT    +INT            -INT            0
        -INT    -INT            +INT            0
        0       Q:0x7FFFFFFF    Q:0x80000000    Q:0x7FFFFFFF
                R:0x7FFFFFFF    R:0x80000000    R:0x7FFFFFFF

    The division-by-zero results are returned as LONG_MAX / LONG_MIN.
    NOTE(review): these equal the 32-bit values in the table only where
    `long` is 32 bits wide - confirm for LP64 targets.

    LONG_MIN / -1 is not representable and is undefined behaviour in C++
    (it traps on x86), so it is handled explicitly and yields LONG_MIN, the
    two's complement wrap-around of the true quotient.

Input:
    src0 - dividend
    src1 - divisor

Output:
    Quotient, or the saturated value from the table when src1 is zero.
\*****************************************************************************/
inline signed long Signed32SafeDivideQuotient(
    const signed long src0,
    const signed long src1 )
{
    if( src1 == 0 )
    {
        return ( src0 < 0 ) ? LONG_MIN : LONG_MAX;
    }

    // The only overflowing combination; never hand it to the '/' operator.
    if( ( src0 == LONG_MIN ) && ( src1 == -1 ) )
    {
        return LONG_MIN;
    }

    return src0 / src1;
}
793
/*****************************************************************************\
Inline Function:
    Signed32SafeDivideRemainder

Description:
    Computes the remainder of src0 divided by src1 without ever executing
    an undefined operation.

    Division by zero does not trap: the result is LONG_MIN for a negative
    src0 and LONG_MAX otherwise.

    A divisor of -1 always gives remainder 0. It is handled explicitly
    because LONG_MIN % -1 is undefined behaviour in C++ (it traps on x86),
    the corresponding quotient not being representable.

Input:
    src0 - dividend
    src1 - divisor

Output:
    Remainder (with the sign of src0), or the saturated value described
    above when src1 is zero.
\*****************************************************************************/
inline signed long Signed32SafeDivideRemainder(
    const signed long src0,
    const signed long src1 )
{
    if( src1 == 0 )
    {
        return ( src0 < 0 ) ? LONG_MIN : LONG_MAX;
    }

    // x % -1 is 0 for every x; answering directly avoids LONG_MIN % -1.
    if( src1 == -1 )
    {
        return 0;
    }

    return src0 % src1;
}
816
817 /*****************************************************************************\
818 Inline Function:
819 Unsigned32SafeDivideQuotient
820
821 Description:
822 Computes src0 divided by src1
823 Table for handling unsigned divide quotient and remainder
824 UDIV SRC0
825 SRC1 <>0 0
826 <>0 UINT 0
827 0 Q:0xFFFFFFFF Q:0xFFFFFFFF
828 R:0xFFFFFFFF R:0xFFFFFFFF
829 \*****************************************************************************/
Unsigned32SafeDivideQuotient(const DWORD src0,const DWORD src1)830 inline DWORD Unsigned32SafeDivideQuotient(
831 const DWORD src0,
832 const DWORD src1 )
833 {
834 if( !src1 )
835 {
836 return UINT_MAX;
837 }
838
839 return src0 / src1;
840 }
841
842 /*****************************************************************************\
843 Inline Function:
844 Unsigned32SafeDivideRemainder
845
846 Description:
847 Computes remainder of src0 divided by src1
848 \*****************************************************************************/
Unsigned32SafeDivideRemainder(const DWORD src0,const DWORD src1)849 inline DWORD Unsigned32SafeDivideRemainder(
850 const DWORD src0,
851 const DWORD src1 )
852 {
853 if( !src1 )
854 {
855 return UINT_MAX;
856 }
857
858 return src0 % src1;
859 }
860
/*****************************************************************************\
Inline Function:
    F32ToF16_d

Description:
    Float32 to float16 conversion based on "Fast Half Float Conversions"
    by Jeroen van der Zijp

Input:
    32-bit DWORD representation of float value
Output:
    16-bit WORD representation of float value

\*****************************************************************************/
F32ToF16_d(DWORD arg)875 inline WORD F32ToF16_d( DWORD arg )
876 {
877 static const WORD btbl[512] = {
878 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
879 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
880 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
881 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
882 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
883 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
884 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
885 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
886 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7c00,
887 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
888 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
889 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
890 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
891 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
892 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
893 0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,0x7c00,
894 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
895 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
896 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
897 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
898 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
899 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
900 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
901 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
902 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfc00,
903 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
904 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
905 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
906 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
907 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
908 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,
909 0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00,0xfc00
910 };
911 static const unsigned char stbl[512] = {
912 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
913 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
914 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
915 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
916 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
917 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
918 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
919 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
920 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
921 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
922 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
923 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
924 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
925 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
926 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
927 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x0d,
928 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
929 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
930 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
931 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
932 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
933 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
934 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
935 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
936 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
937 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
938 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
939 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
940 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
941 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
942 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
943 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x0d
944 };
945 DWORD sexp = (arg>>23)&0x1ff;
946 return (WORD)(btbl[ sexp ]+( (arg&0x007fffff)>>stbl[ sexp ] ));
947 }
948
949 /*****************************************************************************\
950
951 Inline Function:
952 F32ToF16_f
953
954 Description:
955 Float32 to float16 conversion based on "Fast Half Float Conversions"
956 by Jeroen van der Zijp
957
958 Input:
959 32-bit float value
960 Output:
    16-bit WORD representation of float value
962
963 \*****************************************************************************/
F32ToF16_f(float arg)964 inline WORD F32ToF16_f( float arg )
965 {
966 return F32ToF16_d( *(DWORD *)&arg );
967 }
968
969 /*****************************************************************************\
970
971 Inline Function:
972 F16ToF32
973
974 Description:
975 Float16 to float32 conversion
976
977 Input:
978 16-bit WORD representation of float16 value
979 Output:
    32-bit DWORD representation of float32 value
981
982 \*****************************************************************************/
F16ToF32(WORD v)983 static inline DWORD F16ToF32( WORD v )
984 {
985 unsigned long index;
986 return
987 // is exponent!=0 ?
988 v & 0x7C00
989 // is exponent==max ?
990 ? ( v & 0x7C00 ) == 0x7C00
991 // is mantissa!=0 ?
992 ? v & 0x03FF
993 // convert NaN
994 ? ( ( v << 13 ) + 0x70000000 ) | 0x7f800000
995 // convert infinities
996 : ( v << 16 ) | 0x7f800000
997 // convert normalized values
998 : ( ( ( v << 13 ) + 0x70000000 ) & ~0x70000000 ) + 0x38000000
999 // is mantissa non-zero ?
1000 : v & 0x03FF
1001 // convert denormalized values
1002 ? index=bsr( v & 0x03FF ), ( ( ( ( v << 16 ) & 0x80000000 ) | ( ( v << 13 ) & 0xF800000 ) ) + 0x33800000 + ( index << 23 ) ) | ( ( ( v & 0x03FF ) << ( 23-index ) ) & ~0x800000 )
1003 // convert zeros
1004 : v << 16;
1005 }
1006
1007 /*****************************************************************************\
1008 Inline Function:
1009 Float32SafeMax
1010
1011 Description:
1012 MinMax of Floating Point Numbers.
1013
1014 Input:
1015 arg1
1016 arg2
1017 isGen7
1018
1019 Output:
1020 max( arg1, arg2 )
1021
1022 \*****************************************************************************/
Float32SafeMax(const float arg1,const float arg2,bool isGen7)1023 inline float Float32SafeMax( const float arg1, const float arg2, bool isGen7 )
1024 {
1025 // Values of following arrays corresponds to results of sel.l instructions.
1026
1027 static const bool RESULT_preGen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1028 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
1029 { true , false , false , false , false , false , false , false , true }, // -Inf
1030 { true , false , false , false , false , false , false , false , true }, // -X
1031 { true , true , true , true , true , true , false , false , true }, // -denorm
1032 { true , true , true , true , true , true , false , false , true }, // -0
1033 { true , true , true , true , true , true , false , false , true }, // +0
1034 { true , true , true , true , true , true , false , false , true }, // +denorm
1035 { true , true , true , true , true , true , false , false , true }, // +X
1036 { true , true , true , true , true , true , true , true , true }, // +Inf
1037 { false , false , false , false , false , false , false , false , false }, // NaN
1038 };
1039
1040 static const bool RESULT_Gen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1041 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
1042 { true , false , false , false , false , false , false , false , true }, // -Inf
1043 { true , false , false , false , false , false , false , false , true }, // -X
1044 { true , true , true , true , true , true , false , false , true }, // -denorm
1045 { true , true , true , true , false , true , false , false , true }, // -0
1046 { true , true , true , true , true , true , false , false , true }, // +0
1047 { true , true , true , true , true , true , false , false , true }, // +denorm
1048 { true , true , true , true , true , true , false , false , true }, // +X
1049 { true , true , true , true , true , true , true , true , true }, // +Inf
1050 { false , false , false , false , false , false , false , false , false }, // NaN
1051 };
1052
1053 const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
1054 const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
1055
1056 if( ( t1 == FPU_FLOAT_CLASS_NEG_FINITE || t1 == FPU_FLOAT_CLASS_POS_FINITE ) &&
1057 ( t2 == FPU_FLOAT_CLASS_NEG_FINITE || t2 == FPU_FLOAT_CLASS_POS_FINITE ) )
1058 {
1059 return ( arg1 >= arg2 ) ? arg1 : arg2;
1060 }
1061
1062 FLOAT32 f32;
1063
1064 if( isGen7 )
1065 {
1066 f32.value.f = ( RESULT_Gen7[t1][t2] ) ? arg1 : arg2;
1067 }
1068 else
1069 {
1070 f32.value.f = ( RESULT_preGen7[t1][t2] ) ? arg1 : arg2;
1071 }
1072
1073 return f32.value.f;
1074 }
1075
1076 /*****************************************************************************\
1077 Inline Function:
1078 Float32SafeMin
1079
1080 Description:
1081 MinMax of Floating Point Numbers.
1082
1083 Input:
1084 arg1
1085 arg2
1086 isGen7
1087
1088 Output:
    min( arg1, arg2 )
1090
1091 \*****************************************************************************/
Float32SafeMin(const float arg1,const float arg2,bool isGen7)1092 inline float Float32SafeMin( const float arg1, const float arg2, bool isGen7 )
1093 {
1094 // Values of following arrays corresponds to results of sel.ge instruction.
1095
1096 static const bool RESULT_preGen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1097 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
1098 { false , true , true , true , true , true , true , true , true }, // -Inf
1099 { false , false , true , true , true , true , true , true , true }, // -X
1100 { false , false , false , false , false , false , true , true , true }, // -denorm
1101 { false , false , false , false , false , false , true , true , true }, // -0
1102 { false , false , false , false , false , false , true , true , true }, // +0
1103 { false , false , false , false , false , false , true , true , true }, // +denorm
1104 { false , false , false , false , false , false , false , true , true }, // +X
1105 { false , false , false , false , false , false , false , false , true }, // +Inf
1106 { false , false , false , false , false , false , false , false , false }, // NaN
1107 };
1108
1109 static const bool RESULT_Gen7[NUM_FPU_FLOAT_CLASSES][NUM_FPU_FLOAT_CLASSES] = {
1110 // -Inf -X -denorm -0 +0 +denorm +X +Inf NaN
1111 { false , true , true , true , true , true , true , true , true }, // -Inf
1112 { false , false , true , true , true , true , true , true , true }, // -X
1113 { false , false , false , false , false , false , true , true , true }, // -denorm
1114 { false , false , false , false , true , false , true , true , true }, // -0
1115 { false , false , false , false , false , false , true , true , true }, // +0
1116 { false , false , false , false , false , false , true , true , true }, // +denorm
1117 { false , false , false , false , false , false , false , true , true }, // +X
1118 { false , false , false , false , false , false , false , false , true }, // +Inf
1119 { false , false , false , false , false , false , false , false , false }, // NaN
1120 };
1121
1122 const FPU_FLOAT_CLASS t1 = Float32GetClass( arg1 );
1123 const FPU_FLOAT_CLASS t2 = Float32GetClass( arg2 );
1124
1125 if( ( t1 == FPU_FLOAT_CLASS_NEG_FINITE || t1 == FPU_FLOAT_CLASS_POS_FINITE ) &&
1126 ( t2 == FPU_FLOAT_CLASS_NEG_FINITE || t2 == FPU_FLOAT_CLASS_POS_FINITE ) )
1127 {
1128 return ( arg1 < arg2 ) ? arg1 : arg2;
1129 }
1130
1131 FLOAT32 f32;
1132
1133 if( isGen7 )
1134 {
1135 f32.value.f = ( RESULT_Gen7[t1][t2] ) ? arg1 : arg2;
1136 }
1137 else
1138 {
1139 f32.value.f = ( RESULT_preGen7[t1][t2] ) ? arg1 : arg2;
1140 }
1141
1142 return f32.value.f;
1143 }
1144
1145 /*****************************************************************************\
1146 Inline Function:
1147 FloatSaturate
1148
1149 Description:
1150
1151 For a floating-point destination type, the saturation target range is [0.0,
1152 1.0]. For a floating-point NaN, there is no "closest value"; any NaN
1153 saturates to 0.0. (...) Any floating-point number greater than 1.0,
1154 including +INF, saturates to 1.0. Any negative floating-point number,
1155 including -INF, saturates to 0.0. Any floating-point number in the range 0.0
1156 to 1.0 is not changed by saturation.
1157
1158 -0.0 is changed to +0.0.
1159
1160 Input:
1161 const float f
1162
1163 Output:
1164 float
1165
1166 \*****************************************************************************/
FloatSaturate(const float f)1167 inline float FloatSaturate( const float f )
1168 {
1169 switch( Float32GetClass( f ) )
1170 {
1171 case FPU_FLOAT_CLASS_NEG_INF:
1172 case FPU_FLOAT_CLASS_NEG_FINITE:
1173 case FPU_FLOAT_CLASS_NEG_DENORM:
1174 case FPU_FLOAT_CLASS_NEG_ZERO:
1175 case FPU_FLOAT_CLASS_POS_ZERO:
1176 case FPU_FLOAT_CLASS_NAN:
1177 return 0.f;
1178 case FPU_FLOAT_CLASS_POS_DENORM:
1179 return f;
1180 case FPU_FLOAT_CLASS_POS_FINITE:
1181 return ( f <= 1.f ) ? f : 1.f;
1182 case FPU_FLOAT_CLASS_POS_INF:
1183 return 1.f;
1184 default:
1185 ASSERT( 0 );
1186 return 0.f;
1187 }
1188 }
1189
1190 } // namespace iSTD
1191