1 /*
2 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
/*
 * Common set of interface routines to convert an intrinsic math library call
 * using Intel AVX-512 vectors into two calls of the corresponding AVX2
 * implementation.
 *
 * Note: this code is common to both the AVX-512 and KNL architectures.
 * Thus, it has to use only Intel intrinsics that are common to both systems.
 */
26
27
28 static
29 vrs16_t
30 __attribute__((noinline))
__gs_z2yy_x(vrs16_t x,vrs8_t (* func)(vrs8_t))31 __gs_z2yy_x(vrs16_t x, vrs8_t(*func)(vrs8_t))
32 {
33 vrs8_t rl, ru;
34 ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));
35 rl = func((vrs8_t) _mm512_castps512_ps256(x));
36 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
37 (__m256d)ru, 1);
38 }
39
40 static
41 vrs16_t
42 __attribute__((noinline))
__gs_z2yy_xy(vrs16_t x,vrs16_t y,vrs8_t (* func)(vrs8_t,vrs8_t))43 __gs_z2yy_xy(vrs16_t x, vrs16_t y, vrs8_t(*func)(vrs8_t, vrs8_t))
44 {
45 vrs8_t rl, ru;
46 ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
47 (vrs8_t) _mm512_extractf64x4_pd((__m512d)y, 1));
48 rl = func((vrs8_t) _mm512_castps512_ps256(x),
49 (vrs8_t) _mm512_castps512_ps256(y));
50 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
51 (__m256d)ru, 1);
52 }
53
54 static
55 vrs16_t
56 __attribute__((noinline))
__gs_z2yy_sincos(vrs16_t x,vrs8_t (* func)(vrs8_t))57 __gs_z2yy_sincos(vrs16_t x, vrs8_t(*func)(vrs8_t))
58 {
59 vrs8_t su, sl, cu;
60 su = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));
61 asm("vmovaps %%ymm1, %0" : :"m"(cu) :);
62 sl = func((vrs8_t) _mm512_castps512_ps256(x));
63 asm("vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1" : : "m"(cu) : );
64 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),
65 (__m256d)su, 1);
66 }
67
68 static
69 vrs16_t
70 __attribute__((noinline))
__gs_z2yy_xk1(vrs16_t x,int64_t iy,vrs8_t (* func)(vrs8_t,int64_t))71 __gs_z2yy_xk1(vrs16_t x, int64_t iy, vrs8_t(*func)(vrs8_t, int64_t))
72 {
73 vrs8_t rl, ru;
74 ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);
75 rl = func((vrs8_t) _mm512_castps512_ps256(x), iy);
76 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
77 (__m256d)ru, 1);
78 }
79
80 static
81 vrs16_t
82 __attribute__((noinline))
__gs_z2yy_xi(vrs16_t x,vis16_t iy,vrs8_t (* func)(vrs8_t,vis8_t))83 __gs_z2yy_xi(vrs16_t x, vis16_t iy, vrs8_t(*func)(vrs8_t, vis8_t))
84 {
85 vrs8_t rl, ru;
86 ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
87 (vis8_t) _mm512_extractf64x4_pd((__m512d)iy, 1));
88 rl = func((vrs8_t) _mm512_castps512_ps256(x),
89 (vis8_t) _mm512_castps512_ps256((__m512)iy));
90 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
91 (__m256d)ru, 1);
92 }
93
94 static
95 vrs16_t
96 __attribute__((noinline))
__gs_z2yy_xk(vrs16_t x,vid8_t iyu,vid8_t iyl,vrs8_t (* func)(vrs8_t,vid4_t,vid4_t))97 __gs_z2yy_xk(vrs16_t x, vid8_t iyu, vid8_t iyl, vrs8_t(*func)(vrs8_t, vid4_t, vid4_t))
98 {
99 vrs8_t rl, ru;
100 ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
101 (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 1),
102 (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 0));
103 rl = func((vrs8_t) _mm512_castps512_ps256(x),
104 (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 1),
105 (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 0));
106 return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
107 (__m256d)ru, 1);
108 }
109
110 static
111 vrd8_t
112 __attribute__((noinline))
__gd_z2yy_x(vrd8_t x,vrd4_t (* func)(vrd4_t))113 __gd_z2yy_x(vrd8_t x, vrd4_t(*func)(vrd4_t))
114 {
115 vrd4_t rl, ru;
116 ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));
117 rl = func((vrd4_t) _mm512_castpd512_pd256(x));
118 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
119 (__m256d)ru, 1);
120 }
121
122 static
123 vrd8_t
124 __attribute__((noinline))
__gd_z2yy_xy(vrd8_t x,vrd8_t y,vrd4_t (* func)(vrd4_t,vrd4_t))125 __gd_z2yy_xy(vrd8_t x, vrd8_t y, vrd4_t(*func)(vrd4_t, vrd4_t))
126 {
127 vrd4_t rl, ru;
128 ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
129 (vrd4_t) _mm512_extractf64x4_pd((__m512d)y, 1));
130 rl = func((vrd4_t) _mm512_castpd512_pd256(x),
131 (vrd4_t) _mm512_castpd512_pd256(y));
132 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
133 (__m256d)ru, 1);
134 }
135
136 static
137 vrd8_t
138 __attribute__((noinline))
__gd_z2yy_sincos(vrd8_t x,vrd4_t (* func)(vrd4_t))139 __gd_z2yy_sincos(vrd8_t x, vrd4_t(*func)(vrd4_t))
140 {
141 vrd4_t su, sl, cu;
142 su = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));
143 asm("vmovaps %%ymm1, %0" : :"m"(cu) :);
144 sl = func((vrd4_t) _mm512_castpd512_pd256(x));
145 asm("vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1" : : "m"(cu) : );
146 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),
147 (__m256d)su, 1);
148 }
149
150 static
151 vrd8_t
152 __attribute__((noinline))
__gd_z2yy_xk1(vrd8_t x,int64_t iy,vrd4_t (* func)(vrd4_t,int64_t))153 __gd_z2yy_xk1(vrd8_t x, int64_t iy, vrd4_t(*func)(vrd4_t, int64_t))
154 {
155 vrd4_t rl, ru;
156 ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);
157 rl = func((vrd4_t) _mm512_castpd512_pd256(x), iy);
158 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
159 (__m256d)ru, 1);
160 }
161
162 static
163 vrd8_t
164 __attribute__((noinline))
__gd_z2yy_xk(vrd8_t x,vid8_t iy,vrd4_t (* func)(vrd4_t,vid4_t))165 __gd_z2yy_xk(vrd8_t x, vid8_t iy, vrd4_t(*func)(vrd4_t, vid4_t))
166 {
167 vrd4_t rl, ru;
168 ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
169 (vid4_t) _mm512_extractf64x4_pd((__m512d)iy, 1));
170 rl = func((vrd4_t) _mm512_castpd512_pd256(x),
171 (vid4_t) _mm512_castpd512_pd256((__m512d)iy));
172 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
173 (__m256d)ru, 1);
174 }
175
176 static
177 vrd8_t
178 __attribute__((noinline))
__gd_z2yy_xi(vrd8_t x,vis8_t iy,vrd4_t (* func)(vrd4_t,vis4_t))179 __gd_z2yy_xi(vrd8_t x, vis8_t iy, vrd4_t(*func)(vrd4_t, vis4_t))
180 {
181 vrd4_t rl, ru;
182 ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
183 (vis4_t) _mm256_extractf128_si256((__m256i)iy, 1));
184 rl = func((vrd4_t) _mm512_castpd512_pd256(x),
185 (vis4_t) _mm256_castsi256_si128((__m256i)iy));
186 return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
187 (__m256d)ru, 1);
188 }
189
190