/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include "fslog_defs.h"

29 extern "C" __m128 __fvs_log_fma3(__m128);
30
__fvs_log_fma3(__m128 a)31 __m128 __fvs_log_fma3(__m128 a) {
32 __m128 const LOG_C1_VEC = _mm_set1_ps(LOG_C1);
33 __m128 const LOG_C2_VEC = _mm_set1_ps(LOG_C2);
34 __m128 const LOG_C3_VEC = _mm_set1_ps(LOG_C3);
35 __m128 const LOG_C4_VEC = _mm_set1_ps(LOG_C4);
36 __m128 const LOG_C5_VEC = _mm_set1_ps(LOG_C5);
37 __m128 const LOG_C6_VEC = _mm_set1_ps(LOG_C6);
38 __m128 const LOG_C7_VEC = _mm_set1_ps(LOG_C7);
39 __m128 const LOG_C8_VEC = _mm_set1_ps(LOG_C8);
40 __m128 const LOG_C9_VEC = _mm_set1_ps(LOG_C9);
41 __m128 const LOG_CA_VEC = _mm_set1_ps(LOG_CA);
42
43 __m128i const CANONICAL_NAN_VEC = _mm_set1_epi32(CANONICAL_NAN);
44 __m128i const MINUS_INF_VEC = _mm_set1_epi32(MINUS_INF);
45 __m128i const NAN_INF_MASK_VEC = _mm_set1_epi32(NAN_INF_MASK);
46
47 __m128 const PARTITION_CONST_VEC = _mm_set1_ps(PARTITION_CONST);
48 __m128 const TWO_TO_M126_F_VEC = _mm_set1_ps(TWO_TO_M126_F);
49 __m128 const TWO_TO_24_F_VEC = _mm_set1_ps(TWO_TO_24_F);
50
51 __m128 const ONE_VEC = _mm_set1_ps(1.0f);
52 __m128 const F24_VEC = _mm_set1_ps(U24);
53 __m128i const BIT_MASK2_VEC = _mm_set1_epi32(BIT_MASK2);
54 __m128i const OFFSET_VEC = _mm_set1_epi32(OFFSET);
55 __m128i exp_offset_vec = _mm_set1_epi32(EXP_OFFSET);
56
57 __m128 const FLT2INT_CVT = _mm_set1_ps(12582912.0f);
58 __m128 FLT2INT_CVT_BIAS = _mm_set1_ps(12582912.0f + 126.0f);
59
60 __m128 mask = _mm_cmp_ps(a, TWO_TO_M126_F_VEC, _CMP_LT_OS);
61 __m128 fix = _mm_blendv_ps(ONE_VEC, TWO_TO_24_F_VEC, mask);
62 a = _mm_mul_ps(a, fix);
63 FLT2INT_CVT_BIAS = _mm_add_ps(FLT2INT_CVT_BIAS, _mm_and_ps(mask, F24_VEC));
64
65 __m128 tmpm;
66 __m128 spec;
67
68 mask = _mm_cmp_ps(a, _mm_set1_ps(0.0f), _CMP_LT_OS);
69 spec = _mm_and_ps((__m128)CANONICAL_NAN_VEC, mask);
70
71 mask = _mm_cmp_ps(a, _mm_set1_ps(0.0f), _CMP_EQ_OS);
72 tmpm = _mm_and_ps(mask, (__m128)MINUS_INF_VEC);
73 spec = _mm_or_ps(tmpm, spec);
74
75 mask = _mm_cmp_ps(a, (__m128)NAN_INF_MASK_VEC, _CMP_EQ_OS);
76 tmpm = _mm_and_ps(mask, a);
77 spec = _mm_or_ps(tmpm,spec);
78 mask = _mm_cmp_ps(a, a, _CMP_NEQ_UQ);
79 tmpm = _mm_and_ps(mask, _mm_add_ps(a,a));
80 spec = _mm_or_ps(tmpm,spec);
81
82 __m128 e = (__m128)_mm_srli_epi32((__m128i)a, 23);
83 e = (__m128)_mm_add_epi32((__m128i)e, (__m128i)FLT2INT_CVT);
84 e = _mm_sub_ps(e, FLT2INT_CVT_BIAS);
85
86 __m128 m = _mm_and_ps((__m128)BIT_MASK2_VEC, a);
87 m = (__m128)_mm_add_epi32((__m128i)m, OFFSET_VEC);
88
89 __m128 mask_shift = _mm_cmp_ps(m, PARTITION_CONST_VEC, _CMP_LT_OS);
90
91 e = _mm_sub_ps(e, _mm_and_ps(mask_shift, _mm_set1_ps(1.0f)));
92 m = _mm_add_ps(m, _mm_and_ps(mask_shift, m));
93 m = _mm_sub_ps(m, _mm_set1_ps(1.0f));
94
95 __m128 const LN2 = _mm_set1_ps(0x1.62E43p-01);
96 e = _mm_mul_ps(e, LN2);
97
98 __m128 t = LOG_CA_VEC;
99 t = _mm_fmadd_ps(t, m, LOG_C9_VEC);
100 t = _mm_fmadd_ps(t, m, LOG_C8_VEC);
101 t = _mm_fmadd_ps(t, m, LOG_C7_VEC);
102 t = _mm_fmadd_ps(t, m, LOG_C6_VEC);
103 t = _mm_fmadd_ps(t, m, LOG_C5_VEC);
104 t = _mm_fmadd_ps(t, m, LOG_C4_VEC);
105 t = _mm_fmadd_ps(t, m, LOG_C3_VEC);
106 t = _mm_fmadd_ps(t, m, LOG_C2_VEC);
107 t = _mm_fmadd_ps(t, m, LOG_C1_VEC);
108
109 __m128 m2 = _mm_mul_ps(m, m);
110 t = _mm_fmadd_ps(t, m2, m);
111 t = _mm_add_ps(t, e);
112 t = _mm_add_ps(t, spec);
113
114 return t;
115 }