1 #pragma once
2
3 #ifdef ZIMG_X86_AVX512
4
5 #ifndef ZIMG_X86_AVX512_UTIL_H_
6 #define ZIMG_X86_AVX512_UTIL_H_
7
8 #include "common/ccdep.h"
9 #include "x86util.h"
10
11 namespace zimg {
12
13 namespace _avx512 {
14
15 // Transpose two 8x8 matrices stored in the lower and upper 256-bit lanes of [row0]-[row7].
mm512_transpose8_x2_ps(__m512 & row0,__m512 & row1,__m512 & row2,__m512 & row3,__m512 & row4,__m512 & row5,__m512 & row6,__m512 & row7)16 static inline FORCE_INLINE void mm512_transpose8_x2_ps(__m512 &row0, __m512 &row1, __m512 &row2, __m512 &row3,
17 __m512 &row4, __m512 &row5, __m512 &row6, __m512 &row7)
18 {
19 __m512 t0, t1, t2, t3, t4, t5, t6, t7;
20 __m512 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
21
22 t0 = _mm512_unpacklo_ps(row0, row1);
23 t1 = _mm512_unpackhi_ps(row0, row1);
24 t2 = _mm512_unpacklo_ps(row2, row3);
25 t3 = _mm512_unpackhi_ps(row2, row3);
26 t4 = _mm512_unpacklo_ps(row4, row5);
27 t5 = _mm512_unpackhi_ps(row4, row5);
28 t6 = _mm512_unpacklo_ps(row6, row7);
29 t7 = _mm512_unpackhi_ps(row6, row7);
30
31 tt0 = _mm512_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
32 tt1 = _mm512_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
33 tt2 = _mm512_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
34 tt3 = _mm512_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
35 tt4 = _mm512_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
36 tt5 = _mm512_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
37 tt6 = _mm512_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
38 tt7 = _mm512_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));
39
40 row0 = _mm512_shuffle_f32x4(tt0, tt4, 0x88);
41 row1 = _mm512_shuffle_f32x4(tt1, tt5, 0x88);
42 row2 = _mm512_shuffle_f32x4(tt2, tt6, 0x88);
43 row3 = _mm512_shuffle_f32x4(tt3, tt7, 0x88);
44 row4 = _mm512_shuffle_f32x4(tt0, tt4, 0xdd);
45 row5 = _mm512_shuffle_f32x4(tt1, tt5, 0xdd);
46 row6 = _mm512_shuffle_f32x4(tt2, tt6, 0xdd);
47 row7 = _mm512_shuffle_f32x4(tt3, tt7, 0xdd);
48 }
49
50 // Exchange the upper 256-bit lane of [row0] with the lower 256-bit lane of [row1].
mm512_exchange_lanes_ps256(__m512 & row0,__m512 & row1)51 static inline FORCE_INLINE void mm512_exchange_lanes_ps256(__m512 &row0, __m512 &row1)
52 {
53 __m512 tmp0 = _mm512_shuffle_f32x4(row0, row1, 0x88);
54 __m512 tmp1 = _mm512_shuffle_f32x4(row0, row1, 0xdd);
55 row0 = tmp0;
56 row1 = tmp1;
57 }
58
59 // Transpose four 8x8 matrices stored in the 128-bit lanes of [row0]-[row7].
mm512_transpose8_x4_epi16(__m512i & row0,__m512i & row1,__m512i & row2,__m512i & row3,__m512i & row4,__m512i & row5,__m512i & row6,__m512i & row7)60 static inline FORCE_INLINE void mm512_transpose8_x4_epi16(__m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3,
61 __m512i &row4, __m512i &row5, __m512i &row6, __m512i &row7)
62 {
63 __m512i t0, t1, t2, t3, t4, t5, t6, t7;
64 __m512i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
65
66 t0 = _mm512_unpacklo_epi16(row0, row1);
67 t1 = _mm512_unpacklo_epi16(row2, row3);
68 t2 = _mm512_unpacklo_epi16(row4, row5);
69 t3 = _mm512_unpacklo_epi16(row6, row7);
70 t4 = _mm512_unpackhi_epi16(row0, row1);
71 t5 = _mm512_unpackhi_epi16(row2, row3);
72 t6 = _mm512_unpackhi_epi16(row4, row5);
73 t7 = _mm512_unpackhi_epi16(row6, row7);
74
75 tt0 = _mm512_unpacklo_epi32(t0, t1);
76 tt1 = _mm512_unpackhi_epi32(t0, t1);
77 tt2 = _mm512_unpacklo_epi32(t2, t3);
78 tt3 = _mm512_unpackhi_epi32(t2, t3);
79 tt4 = _mm512_unpacklo_epi32(t4, t5);
80 tt5 = _mm512_unpackhi_epi32(t4, t5);
81 tt6 = _mm512_unpacklo_epi32(t6, t7);
82 tt7 = _mm512_unpackhi_epi32(t6, t7);
83
84 row0 = _mm512_unpacklo_epi64(tt0, tt2);
85 row1 = _mm512_unpackhi_epi64(tt0, tt2);
86 row2 = _mm512_unpacklo_epi64(tt1, tt3);
87 row3 = _mm512_unpackhi_epi64(tt1, tt3);
88 row4 = _mm512_unpacklo_epi64(tt4, tt6);
89 row5 = _mm512_unpackhi_epi64(tt4, tt6);
90 row6 = _mm512_unpacklo_epi64(tt5, tt7);
91 row7 = _mm512_unpackhi_epi64(tt5, tt7);
92 }
93
94 // Transpose the 4x4 matrix stored in [row0]-[row3].
mm512_transpose4_si128(__m512i & row0,__m512i & row1,__m512i & row2,__m512i & row3)95 static inline FORCE_INLINE void mm512_transpose4_si128(__m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3)
96 {
97 __m512i t0, t1, t2, t3;
98
99 t0 = _mm512_shuffle_i32x4(row0, row1, 0x88);
100 t1 = _mm512_shuffle_i32x4(row0, row1, 0xdd);
101 t2 = _mm512_shuffle_i32x4(row2, row3, 0x88);
102 t3 = _mm512_shuffle_i32x4(row2, row3, 0xdd);
103
104 row0 = _mm512_shuffle_i32x4(t0, t2, 0x88);
105 row1 = _mm512_shuffle_i32x4(t1, t3, 0x88);
106 row2 = _mm512_shuffle_i32x4(t0, t2, 0xdd);
107 row3 = _mm512_shuffle_i32x4(t1, t3, 0xdd);
108 }
109
110 } // namespace _avx512
111
112
// Build a 16-bit mask whose lowest n bits are 1 and whose remaining bits are 0.
// n is expected to be in [0, 16]; the shift count is computed as (16 - n).
static inline FORCE_INLINE __mmask16 mmask16_set_lo(unsigned n)
{
	const unsigned shift = 16 - n;
	const unsigned bits = 0xFFFFU >> shift;
	return static_cast<__mmask16>(bits);
}
118
// Build a 16-bit mask whose highest n bits are 1 and whose remaining bits are 0.
// n is expected to be in [0, 16]; the shift count is computed as (16 - n).
static inline FORCE_INLINE __mmask16 mmask16_set_hi(unsigned n)
{
	const unsigned shift = 16 - n;
	// Bits shifted past position 15 are discarded by the conversion to __mmask16.
	const unsigned bits = 0xFFFFU << shift;
	return static_cast<__mmask16>(bits);
}
124
// Return mask with lower n bits set to 1.
// n must be in [0, 32]. The shift is performed on a 64-bit value so that
// n == 0 (a shift by 32) is well defined and yields an empty mask; shifting a
// 32-bit operand by its full width is undefined behaviour.
static inline FORCE_INLINE __mmask32 mmask32_set_lo(unsigned n)
{
	return static_cast<__mmask32>(0xFFFFFFFFULL >> (32 - n));
}
130
// Return mask with upper n bits set to 1.
// n must be in [0, 32]. The shift is performed on a 64-bit value so that
// n == 0 (a shift by 32) is well defined and yields an empty mask; shifting a
// 32-bit operand by its full width is undefined behaviour. Bits moved above
// position 31 are discarded by the conversion to __mmask32.
static inline FORCE_INLINE __mmask32 mmask32_set_hi(unsigned n)
{
	return static_cast<__mmask32>(0xFFFFFFFFULL << (32 - n));
}
136
137 // Transpose in-place the 16x16 matrix stored in [row0]-[row15].
mm512_transpose16_ps(__m512 & row0,__m512 & row1,__m512 & row2,__m512 & row3,__m512 & row4,__m512 & row5,__m512 & row6,__m512 & row7,__m512 & row8,__m512 & row9,__m512 & row10,__m512 & row11,__m512 & row12,__m512 & row13,__m512 & row14,__m512 & row15)138 static inline FORCE_INLINE void mm512_transpose16_ps(__m512 &row0, __m512 &row1, __m512 &row2, __m512 &row3,
139 __m512 &row4, __m512 &row5, __m512 &row6, __m512 &row7,
140 __m512 &row8, __m512 &row9, __m512 &row10, __m512 &row11,
141 __m512 &row12, __m512 &row13, __m512 &row14, __m512 &row15)
142 {
143 _avx512::mm512_transpose8_x2_ps(row0, row1, row2, row3, row4, row5, row6, row7);
144 _avx512::mm512_transpose8_x2_ps(row8, row9, row10, row11, row12, row13, row14, row15);
145
146 _avx512::mm512_exchange_lanes_ps256(row0, row8);
147 _avx512::mm512_exchange_lanes_ps256(row1, row9);
148 _avx512::mm512_exchange_lanes_ps256(row2, row10);
149 _avx512::mm512_exchange_lanes_ps256(row3, row11);
150 _avx512::mm512_exchange_lanes_ps256(row4, row12);
151 _avx512::mm512_exchange_lanes_ps256(row5, row13);
152 _avx512::mm512_exchange_lanes_ps256(row6, row14);
153 _avx512::mm512_exchange_lanes_ps256(row7, row15);
154 }
155
156 // Transpose in-place the 32x32 matrix stored in [row0]-[row31].
mm512_transpose32_epi16(__m512i & row0,__m512i & row1,__m512i & row2,__m512i & row3,__m512i & row4,__m512i & row5,__m512i & row6,__m512i & row7,__m512i & row8,__m512i & row9,__m512i & row10,__m512i & row11,__m512i & row12,__m512i & row13,__m512i & row14,__m512i & row15,__m512i & row16,__m512i & row17,__m512i & row18,__m512i & row19,__m512i & row20,__m512i & row21,__m512i & row22,__m512i & row23,__m512i & row24,__m512i & row25,__m512i & row26,__m512i & row27,__m512i & row28,__m512i & row29,__m512i & row30,__m512i & row31)157 static inline FORCE_INLINE void mm512_transpose32_epi16(
158 __m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3, __m512i &row4, __m512i &row5, __m512i &row6, __m512i &row7,
159 __m512i &row8, __m512i &row9, __m512i &row10, __m512i &row11, __m512i &row12, __m512i &row13, __m512i &row14, __m512i &row15,
160 __m512i &row16, __m512i &row17, __m512i &row18, __m512i &row19, __m512i &row20, __m512i &row21, __m512i &row22, __m512i &row23,
161 __m512i &row24, __m512i &row25, __m512i &row26, __m512i &row27, __m512i &row28, __m512i &row29, __m512i &row30, __m512i &row31)
162 {
163 _avx512::mm512_transpose8_x4_epi16(row0, row1, row2, row3, row4, row5, row6, row7);
164 _avx512::mm512_transpose8_x4_epi16(row8, row9, row10, row11, row12, row13, row14, row15);
165 _avx512::mm512_transpose8_x4_epi16(row16, row17, row18, row19, row20, row21, row22, row23);
166 _avx512::mm512_transpose8_x4_epi16(row24, row25, row26, row27, row28, row29, row30, row31);
167
168 _avx512::mm512_transpose4_si128(row0, row8, row16, row24);
169 _avx512::mm512_transpose4_si128(row1, row9, row17, row25);
170 _avx512::mm512_transpose4_si128(row2, row10, row18, row26);
171 _avx512::mm512_transpose4_si128(row3, row11, row19, row27);
172 _avx512::mm512_transpose4_si128(row4, row12, row20, row28);
173 _avx512::mm512_transpose4_si128(row5, row13, row21, row29);
174 _avx512::mm512_transpose4_si128(row6, row14, row22, row30);
175 _avx512::mm512_transpose4_si128(row7, row15, row23, row31);
176 }
177
178 } // namespace zimg
179
180 #endif // ZIMG_X86_AVX512_UTIL_H_
181
182 #endif // ZIMG_X86_AVX512
183