/*
 * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * Common set of interface routines to convert an intrinsic math library call
 * using Intel AVX-512 vectors into two calls of the corresponding AVX2
 * implementation.
 *
 * Note: this code is common to both the AVX-512 and KNL architectures.
 *       Thus, it must use only Intel intrinsics that are available on
 *       both systems.
 */


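/*
 * Single precision, one vector argument: split the 16-element vector x into
 * two 8-element halves, call the AVX2 routine on each, and merge the results.
 */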
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_x(vrs16_t x, vrs8_t(*func)(vrs8_t))
{
  vrs8_t rl, ru;
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));
  rl = func((vrs8_t) _mm512_castps512_ps256(x));
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                      (__m256d)ru, 1);
}

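/*
 * Single precision, two vector arguments: split both x and y and apply the
 * AVX2 routine to the matching halves.
 */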
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_xy(vrs16_t x, vrs16_t y, vrs8_t(*func)(vrs8_t, vrs8_t))
{
  vrs8_t rl, ru;
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vrs8_t) _mm512_extractf64x4_pd((__m512d)y, 1));
  rl = func((vrs8_t) _mm512_castps512_ps256(x),
            (vrs8_t) _mm512_castps512_ps256(y));
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                      (__m256d)ru, 1);
}

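/*
 * Single precision sincos: the sine halves are merged and returned as the
 * function value; the cosine halves are combined in zmm1 by the inline asm
 * below, matching the register convention of the underlying sincos routines.
 */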
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_sincos(vrs16_t x, vrs8_t(*func)(vrs8_t))
{
  vrs8_t su, sl, cu;
  su = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));
  /* Save the upper-half cosine returned in ymm1 before the second call
   * clobbers it.  cu is written by the asm, so it is an output operand. */
  asm("vmovaps  %%ymm1, %0" : "=m"(cu) : :);
  sl = func((vrs8_t) _mm512_castps512_ps256(x));
  /* Merge the saved upper-half cosine into zmm1, which now holds the
   * lower-half cosine, so the full cosine result is returned in zmm1. */
  asm("vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1" : : "m"(cu) : );
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),
                                      (__m256d)su, 1);
}

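/*
 * Single precision with a scalar 64-bit integer second argument, which is
 * passed unchanged to both AVX2 calls.
 */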
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_xk1(vrs16_t x, int64_t iy, vrs8_t(*func)(vrs8_t, int64_t))
{
  vrs8_t rl, ru;
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);
  rl = func((vrs8_t) _mm512_castps512_ps256(x), iy);
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                      (__m256d)ru, 1);
}

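/*
 * Single precision with a vector of 16 32-bit integer second arguments,
 * split into halves alongside x.
 */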
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_xi(vrs16_t x, vis16_t iy, vrs8_t(*func)(vrs8_t, vis8_t))
{
  vrs8_t rl, ru;
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vis8_t) _mm512_extractf64x4_pd((__m512d)iy, 1));
  rl = func((vrs8_t) _mm512_castps512_ps256(x),
            (vis8_t) _mm512_castps512_ps256((__m512)iy));
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                      (__m256d)ru, 1);
}

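/*
 * Single precision with 16 64-bit integer second arguments, passed as two
 * 512-bit vectors: iyu pairs with the upper 8 elements of x, iyl with the
 * lower 8.  Each AVX2 call receives its integers as two vid4_t quarters.
 */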
static
vrs16_t
__attribute__((noinline))
__gs_z2yy_xk(vrs16_t x, vid8_t iyu, vid8_t iyl, vrs8_t(*func)(vrs8_t, vid4_t, vid4_t))
{
  vrs8_t rl, ru;
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 1),
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 0));
  rl = func((vrs8_t) _mm512_castps512_ps256(x),
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 1),
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 0));
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                      (__m256d)ru, 1);
}

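/*
 * Double precision, one vector argument: split the 8-element vector x into
 * two 4-element halves, call the AVX2 routine on each, and merge the results.
 */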
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_x(vrd8_t x, vrd4_t(*func)(vrd4_t))
{
  vrd4_t rl, ru;
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));
  rl = func((vrd4_t) _mm512_castpd512_pd256(x));
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                     (__m256d)ru, 1);
}

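/*
 * Double precision, two vector arguments: split both x and y and apply the
 * AVX2 routine to the matching halves.
 */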
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_xy(vrd8_t x, vrd8_t y, vrd4_t(*func)(vrd4_t, vrd4_t))
{
  vrd4_t rl, ru;
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vrd4_t) _mm512_extractf64x4_pd((__m512d)y, 1));
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),
            (vrd4_t) _mm512_castpd512_pd256(y));
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                     (__m256d)ru, 1);
}

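/*
 * Double precision sincos: same register convention as the single precision
 * variant above.
 */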
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_sincos(vrd8_t x, vrd4_t(*func)(vrd4_t))
{
  vrd4_t su, sl, cu;
  su = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));
  /* Save the upper-half cosine returned in ymm1 before the second call
   * clobbers it.  cu is written by the asm, so it is an output operand. */
  asm("vmovaps  %%ymm1, %0" : "=m"(cu) : :);
  sl = func((vrd4_t) _mm512_castpd512_pd256(x));
  /* Merge the saved upper-half cosine into zmm1, which now holds the
   * lower-half cosine, so the full cosine result is returned in zmm1. */
  asm("vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1" : : "m"(cu) : );
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),
                                     (__m256d)su, 1);
}

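/*
 * Double precision with a scalar 64-bit integer second argument, which is
 * passed unchanged to both AVX2 calls.
 */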
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_xk1(vrd8_t x, int64_t iy, vrd4_t(*func)(vrd4_t, int64_t))
{
  vrd4_t rl, ru;
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);
  rl = func((vrd4_t) _mm512_castpd512_pd256(x), iy);
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                     (__m256d)ru, 1);
}

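/*
 * Double precision with a vector of 8 64-bit integer second arguments,
 * split into halves alongside x.
 */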
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_xk(vrd8_t x, vid8_t iy, vrd4_t(*func)(vrd4_t, vid4_t))
{
  vrd4_t rl, ru;
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iy, 1));
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),
            (vid4_t) _mm512_castpd512_pd256((__m512d)iy));
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                     (__m256d)ru, 1);
}

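/*
 * Double precision with a vector of 8 32-bit integer second arguments,
 * split into two 128-bit halves alongside x.
 */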
static
vrd8_t
__attribute__((noinline))
__gd_z2yy_xi(vrd8_t x, vis8_t iy, vrd4_t(*func)(vrd4_t, vis4_t))
{
  vrd4_t rl, ru;
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),
            (vis4_t) _mm256_extractf128_si256((__m256i)iy, 1));
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),
            (vis4_t) _mm256_castsi256_si128((__m256i)iy));
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),
                                     (__m256d)ru, 1);
}