1 
/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
18 
19 
20 #if defined(TARGET_LINUX_POWER)
21 #include "xmm2altivec.h"
22 #elif defined(TARGET_LINUX_ARM64)
23 #include "arm64intrin.h"
24 #else
25 #include <immintrin.h>
26 #endif
27 #include "fslog_defs.h"
28 
29 extern "C" __m128 __fvs_log_fma3(__m128);
30 
__m128 __fvs_log_fma3(__m128 a) {
    /*
     * Vectorized single-precision natural log, 4 lanes per call.
     * Standard reduction: write a = 2^e * m, then
     *     log(a) = e*ln(2) + P(m - 1)
     * where P is the degree-10 Horner polynomial below. All LOG_C*,
     * mask, and bias constants come from fslog_defs.h (not visible
     * here), so their exact values are inferred from their names.
     */
    __m128 const LOG_C1_VEC = _mm_set1_ps(LOG_C1);
    __m128 const LOG_C2_VEC = _mm_set1_ps(LOG_C2);
    __m128 const LOG_C3_VEC = _mm_set1_ps(LOG_C3);
    __m128 const LOG_C4_VEC = _mm_set1_ps(LOG_C4);
    __m128 const LOG_C5_VEC = _mm_set1_ps(LOG_C5);
    __m128 const LOG_C6_VEC = _mm_set1_ps(LOG_C6);
    __m128 const LOG_C7_VEC = _mm_set1_ps(LOG_C7);
    __m128 const LOG_C8_VEC = _mm_set1_ps(LOG_C8);
    __m128 const LOG_C9_VEC = _mm_set1_ps(LOG_C9);
    __m128 const LOG_CA_VEC = _mm_set1_ps(LOG_CA);

    __m128i const CANONICAL_NAN_VEC = _mm_set1_epi32(CANONICAL_NAN);
    __m128i const MINUS_INF_VEC = _mm_set1_epi32(MINUS_INF);
    __m128i const NAN_INF_MASK_VEC = _mm_set1_epi32(NAN_INF_MASK);

    __m128 const PARTITION_CONST_VEC = _mm_set1_ps(PARTITION_CONST);
    __m128 const TWO_TO_M126_F_VEC = _mm_set1_ps(TWO_TO_M126_F);
    __m128 const TWO_TO_24_F_VEC = _mm_set1_ps(TWO_TO_24_F);

    __m128 const ONE_VEC = _mm_set1_ps(1.0f);
    __m128 const F24_VEC = _mm_set1_ps(U24);
    __m128i const BIT_MASK2_VEC = _mm_set1_epi32(BIT_MASK2);
    __m128i const OFFSET_VEC = _mm_set1_epi32(OFFSET);
    __m128i exp_offset_vec = _mm_set1_epi32(EXP_OFFSET);

    /* 12582912.0f = 1.5 * 2^23: adding a small integer to this value's
     * bit pattern yields a float equal to 12582912 + that integer — the
     * classic branch-free int->float conversion trick used below for
     * the exponent. The BIAS variant also folds in the IEEE bias. */
    __m128 const FLT2INT_CVT = _mm_set1_ps(12582912.0f);
    __m128 FLT2INT_CVT_BIAS = _mm_set1_ps(12582912.0f + 126.0f);

    /* Subnormal rescue: lanes below TWO_TO_M126_F (presumably 2^-126,
     * the smallest normal — TODO confirm against fslog_defs.h) are
     * scaled up by TWO_TO_24_F so the exponent-field extraction sees a
     * normal number; the extra 24 (F24_VEC/U24) is folded into the
     * conversion bias so the final e is still correct. */
    __m128 mask = _mm_cmp_ps(a, TWO_TO_M126_F_VEC, _CMP_LT_OS);
    __m128 fix = _mm_blendv_ps(ONE_VEC, TWO_TO_24_F_VEC, mask);
    a = _mm_mul_ps(a, fix);
    FLT2INT_CVT_BIAS = _mm_add_ps(FLT2INT_CVT_BIAS, _mm_and_ps(mask, F24_VEC));

    /* Accumulate special-case results per lane into `spec` (added to
     * the finite polynomial result at the very end, where a NaN/inf
     * contribution swamps the garbage the main path computes). */
    __m128 tmpm;
    __m128 spec;

    /* a < 0 -> canonical quiet NaN. */
    mask = _mm_cmp_ps(a, _mm_set1_ps(0.0f), _CMP_LT_OS);
    spec = _mm_and_ps((__m128)CANONICAL_NAN_VEC, mask);

    /* a == 0 (either sign) -> -inf. */
    mask = _mm_cmp_ps(a, _mm_set1_ps(0.0f), _CMP_EQ_OS);
    tmpm = _mm_and_ps(mask, (__m128)MINUS_INF_VEC);
    spec = _mm_or_ps(tmpm, spec);

    /* Lanes equal to NAN_INF_MASK viewed as float (presumably +inf —
     * TODO confirm) pass the input through unchanged: log(inf) = inf. */
    mask = _mm_cmp_ps(a, (__m128)NAN_INF_MASK_VEC, _CMP_EQ_OS);
    tmpm = _mm_and_ps(mask, a);
    spec = _mm_or_ps(tmpm,spec);
    /* a != a detects NaN; a+a propagates it (quieting a signaling NaN). */
    mask = _mm_cmp_ps(a, a, _CMP_NEQ_UQ);
    tmpm = _mm_and_ps(mask, _mm_add_ps(a,a));
    spec = _mm_or_ps(tmpm,spec);

    /* Exponent: shift the raw bits right 23 so the biased exponent sits
     * in the low bits, splice it into FLT2INT_CVT's bit pattern, then
     * subtract the (possibly denormal-adjusted) bias to get the
     * unbiased exponent as a float.
     * NOTE(review): the (__m128)/(__m128i) casts are GCC/Clang vector
     * extensions; _mm_castps_si128/_mm_castsi128_ps is the portable
     * spelling. */
    __m128 e = (__m128)_mm_srli_epi32((__m128i)a, 23);
           e = (__m128)_mm_add_epi32((__m128i)e, (__m128i)FLT2INT_CVT);
           e = _mm_sub_ps(e, FLT2INT_CVT_BIAS);

    /* Mantissa: keep the fraction bits (BIT_MASK2, presumably the low
     * 23 bits) and add OFFSET (presumably the exponent field of 1.0f)
     * to rebuild a float m in [1, 2) — TODO confirm constants. */
    __m128 m = _mm_and_ps((__m128)BIT_MASK2_VEC, a);
           m = (__m128)_mm_add_epi32((__m128i)m, OFFSET_VEC);

    /* Range partition: where m < PARTITION_CONST, double m and drop e
     * by one, centering m near 1 so that m-1 below stays small and the
     * polynomial converges well. */
    __m128 mask_shift = _mm_cmp_ps(m, PARTITION_CONST_VEC, _CMP_LT_OS);

    e = _mm_sub_ps(e, _mm_and_ps(mask_shift, _mm_set1_ps(1.0f)));
    m = _mm_add_ps(m, _mm_and_ps(mask_shift, m));   /* m += m where masked */
    m = _mm_sub_ps(m, _mm_set1_ps(1.0f));           /* m in reduced range around 0 */

    /* e * ln(2): 0x1.62E43p-1 is single-precision ln(2). */
    __m128 const LN2 = _mm_set1_ps(0x1.62E43p-01);
    e = _mm_mul_ps(e, LN2);

    /* Horner evaluation of the minimax polynomial with fused
     * multiply-adds, highest coefficient (LOG_CA) first. */
    __m128 t =                       LOG_CA_VEC;
           t = _mm_fmadd_ps(t, m, LOG_C9_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C8_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C7_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C6_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C5_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C4_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C3_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C2_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C1_VEC);

    /* log(1+m) ≈ t*m^2 + m; add e*ln2, then the special-case lanes. */
    __m128 m2 = _mm_mul_ps(m, m);
           t = _mm_fmadd_ps(t, m2, m);
           t = _mm_add_ps(t, e);
           t = _mm_add_ps(t, spec);

    return t;
}
116 
117