#pragma once

#ifdef ZIMG_X86_AVX512

#ifndef ZIMG_X86_AVX512_UTIL_H_
#define ZIMG_X86_AVX512_UTIL_H_

#include <immintrin.h>

#include "common/ccdep.h"
#include "x86util.h"

namespace zimg {

namespace _avx512 {

// Transpose two 8x8 matrices stored in the lower and upper 256-bit lanes of [row0]-[row7].
static inline FORCE_INLINE void mm512_transpose8_x2_ps(__m512 &row0, __m512 &row1, __m512 &row2, __m512 &row3,
                                                       __m512 &row4, __m512 &row5, __m512 &row6, __m512 &row7)
{
	__m512 t0, t1, t2, t3, t4, t5, t6, t7;
	__m512 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

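	// Pass 1: interleave 32-bit elements of adjacent row pairs within each 128-bit lane.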
	t0 = _mm512_unpacklo_ps(row0, row1);
	t1 = _mm512_unpackhi_ps(row0, row1);
	t2 = _mm512_unpacklo_ps(row2, row3);
	t3 = _mm512_unpackhi_ps(row2, row3);
	t4 = _mm512_unpacklo_ps(row4, row5);
	t5 = _mm512_unpackhi_ps(row4, row5);
	t6 = _mm512_unpacklo_ps(row6, row7);
	t7 = _mm512_unpackhi_ps(row6, row7);

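	// Pass 2: combine the interleaved pairs so that each 128-bit lane holds one element from each of four rows.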
	tt0 = _mm512_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
	tt1 = _mm512_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
	tt2 = _mm512_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
	tt3 = _mm512_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
	tt4 = _mm512_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
	tt5 = _mm512_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
	tt6 = _mm512_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
	tt7 = _mm512_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

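	// Pass 3: 0x88 gathers the even 128-bit lanes of both operands, 0xdd the odd lanes, completing the transpose.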
	row0 = _mm512_shuffle_f32x4(tt0, tt4, 0x88);
	row1 = _mm512_shuffle_f32x4(tt1, tt5, 0x88);
	row2 = _mm512_shuffle_f32x4(tt2, tt6, 0x88);
	row3 = _mm512_shuffle_f32x4(tt3, tt7, 0x88);
	row4 = _mm512_shuffle_f32x4(tt0, tt4, 0xdd);
	row5 = _mm512_shuffle_f32x4(tt1, tt5, 0xdd);
	row6 = _mm512_shuffle_f32x4(tt2, tt6, 0xdd);
	row7 = _mm512_shuffle_f32x4(tt3, tt7, 0xdd);
}

// Exchange the upper 256-bit lane of [row0] with the lower 256-bit lane of [row1].
static inline FORCE_INLINE void mm512_exchange_lanes_ps256(__m512 &row0, __m512 &row1)
{
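	// 0x88 selects the even 128-bit lanes of both operands, 0xdd the odd lanes.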
	__m512 tmp0 = _mm512_shuffle_f32x4(row0, row1, 0x88);
	__m512 tmp1 = _mm512_shuffle_f32x4(row0, row1, 0xdd);
	row0 = tmp0;
	row1 = tmp1;
}

// Transpose four 8x8 matrices stored in the 128-bit lanes of [row0]-[row7].
static inline FORCE_INLINE void mm512_transpose8_x4_epi16(__m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3,
                                                          __m512i &row4, __m512i &row5, __m512i &row6, __m512i &row7)
{
	__m512i t0, t1, t2, t3, t4, t5, t6, t7;
	__m512i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

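	// Pass 1: interleave 16-bit elements of adjacent row pairs within each 128-bit lane.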
	t0 = _mm512_unpacklo_epi16(row0, row1);
	t1 = _mm512_unpacklo_epi16(row2, row3);
	t2 = _mm512_unpacklo_epi16(row4, row5);
	t3 = _mm512_unpacklo_epi16(row6, row7);
	t4 = _mm512_unpackhi_epi16(row0, row1);
	t5 = _mm512_unpackhi_epi16(row2, row3);
	t6 = _mm512_unpackhi_epi16(row4, row5);
	t7 = _mm512_unpackhi_epi16(row6, row7);

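	// Pass 2: interleave the resulting 32-bit pairs.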
	tt0 = _mm512_unpacklo_epi32(t0, t1);
	tt1 = _mm512_unpackhi_epi32(t0, t1);
	tt2 = _mm512_unpacklo_epi32(t2, t3);
	tt3 = _mm512_unpackhi_epi32(t2, t3);
	tt4 = _mm512_unpacklo_epi32(t4, t5);
	tt5 = _mm512_unpackhi_epi32(t4, t5);
	tt6 = _mm512_unpacklo_epi32(t6, t7);
	tt7 = _mm512_unpackhi_epi32(t6, t7);

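	// Pass 3: interleave 64-bit quads, finishing an 8x8 transpose in each 128-bit lane.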
	row0 = _mm512_unpacklo_epi64(tt0, tt2);
	row1 = _mm512_unpackhi_epi64(tt0, tt2);
	row2 = _mm512_unpacklo_epi64(tt1, tt3);
	row3 = _mm512_unpackhi_epi64(tt1, tt3);
	row4 = _mm512_unpacklo_epi64(tt4, tt6);
	row5 = _mm512_unpackhi_epi64(tt4, tt6);
	row6 = _mm512_unpacklo_epi64(tt5, tt7);
	row7 = _mm512_unpackhi_epi64(tt5, tt7);
}

// Transpose the 4x4 matrix of 128-bit lanes stored in [row0]-[row3].
static inline FORCE_INLINE void mm512_transpose4_si128(__m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3)
{
	__m512i t0, t1, t2, t3;

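	// Two rounds of lane shuffles: gather even/odd lanes of row pairs, then of the intermediate results.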
	t0 = _mm512_shuffle_i32x4(row0, row1, 0x88);
	t1 = _mm512_shuffle_i32x4(row0, row1, 0xdd);
	t2 = _mm512_shuffle_i32x4(row2, row3, 0x88);
	t3 = _mm512_shuffle_i32x4(row2, row3, 0xdd);

	row0 = _mm512_shuffle_i32x4(t0, t2, 0x88);
	row1 = _mm512_shuffle_i32x4(t1, t3, 0x88);
	row2 = _mm512_shuffle_i32x4(t0, t2, 0xdd);
	row3 = _mm512_shuffle_i32x4(t1, t3, 0xdd);
}

} // namespace _avx512


// Return mask with the lower n bits set to 1 (n must be in [0, 16]).
static inline FORCE_INLINE __mmask16 mmask16_set_lo(unsigned n)
{
	return 0xFFFFU >> (16 - n);
}

// Return mask with the upper n bits set to 1 (n must be in [0, 16]).
static inline FORCE_INLINE __mmask16 mmask16_set_hi(unsigned n)
{
	return 0xFFFFU << (16 - n);
}

// Return mask with the lower n bits set to 1 (n must be in [1, 32]; a shift by 32 would be undefined).
static inline FORCE_INLINE __mmask32 mmask32_set_lo(unsigned n)
{
	return 0xFFFFFFFFU >> (32 - n);
}

// Return mask with the upper n bits set to 1 (n must be in [1, 32]; a shift by 32 would be undefined).
static inline FORCE_INLINE __mmask32 mmask32_set_hi(unsigned n)
{
	return 0xFFFFFFFFU << (32 - n);
}
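
// Example (illustrative only, not used elsewhere in this header): a loop tail of
// n floats (n <= 16) can be read with a masked load:
//
//   __m512 v = _mm512_maskz_loadu_ps(mmask16_set_lo(n), ptr);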

// Transpose in-place the 16x16 matrix stored in [row0]-[row15].
static inline FORCE_INLINE void mm512_transpose16_ps(__m512 &row0, __m512 &row1, __m512 &row2, __m512 &row3,
                                                     __m512 &row4, __m512 &row5, __m512 &row6, __m512 &row7,
                                                     __m512 &row8, __m512 &row9, __m512 &row10, __m512 &row11,
                                                     __m512 &row12, __m512 &row13, __m512 &row14, __m512 &row15)
{
	_avx512::mm512_transpose8_x2_ps(row0, row1, row2, row3, row4, row5, row6, row7);
	_avx512::mm512_transpose8_x2_ps(row8, row9, row10, row11, row12, row13, row14, row15);

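	// Combine the two half-transposed groups; each exchange assembles two full 16-element output rows.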
	_avx512::mm512_exchange_lanes_ps256(row0, row8);
	_avx512::mm512_exchange_lanes_ps256(row1, row9);
	_avx512::mm512_exchange_lanes_ps256(row2, row10);
	_avx512::mm512_exchange_lanes_ps256(row3, row11);
	_avx512::mm512_exchange_lanes_ps256(row4, row12);
	_avx512::mm512_exchange_lanes_ps256(row5, row13);
	_avx512::mm512_exchange_lanes_ps256(row6, row14);
	_avx512::mm512_exchange_lanes_ps256(row7, row15);
}

// Transpose in-place the 32x32 matrix stored in [row0]-[row31].
static inline FORCE_INLINE void mm512_transpose32_epi16(
	__m512i &row0, __m512i &row1, __m512i &row2, __m512i &row3, __m512i &row4, __m512i &row5, __m512i &row6, __m512i &row7,
	__m512i &row8, __m512i &row9, __m512i &row10, __m512i &row11, __m512i &row12, __m512i &row13, __m512i &row14, __m512i &row15,
	__m512i &row16, __m512i &row17, __m512i &row18, __m512i &row19, __m512i &row20, __m512i &row21, __m512i &row22, __m512i &row23,
	__m512i &row24, __m512i &row25, __m512i &row26, __m512i &row27, __m512i &row28, __m512i &row29, __m512i &row30, __m512i &row31)
{
	_avx512::mm512_transpose8_x4_epi16(row0, row1, row2, row3, row4, row5, row6, row7);
	_avx512::mm512_transpose8_x4_epi16(row8, row9, row10, row11, row12, row13, row14, row15);
	_avx512::mm512_transpose8_x4_epi16(row16, row17, row18, row19, row20, row21, row22, row23);
	_avx512::mm512_transpose8_x4_epi16(row24, row25, row26, row27, row28, row29, row30, row31);

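	// Each 128-bit lane now holds an 8x8 sub-transpose; transposing the 4x4 grid of lanes completes the 32x32 transpose.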
	_avx512::mm512_transpose4_si128(row0, row8, row16, row24);
	_avx512::mm512_transpose4_si128(row1, row9, row17, row25);
	_avx512::mm512_transpose4_si128(row2, row10, row18, row26);
	_avx512::mm512_transpose4_si128(row3, row11, row19, row27);
	_avx512::mm512_transpose4_si128(row4, row12, row20, row28);
	_avx512::mm512_transpose4_si128(row5, row13, row21, row29);
	_avx512::mm512_transpose4_si128(row6, row14, row22, row30);
	_avx512::mm512_transpose4_si128(row7, row15, row23, row31);
}

} // namespace zimg

#endif // ZIMG_X86_AVX512_UTIL_H_

#endif // ZIMG_X86_AVX512