#ifdef ZIMG_X86_AVX512

#include <cstdint>
#include <immintrin.h>
#include "common/align.h"
#include "common/ccdep.h"
#include "depth_convert_x86.h"

#include "common/x86/avx512_util.h"

namespace zimg {
namespace depth {

namespace {

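// Load policies: load16() reads 16 source pixels from an aligned pointer and
// widens them to packed float32; load32i() reads 32 pixels and widens them to
// packed 16-bit integers. The kernels below are parameterized on these so the
// same loop body serves 8-bit and 16-bit input.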
struct LoadU8 {
	typedef uint8_t src_type;

	static inline FORCE_INLINE __m512 load16(const uint8_t *ptr)
	{
		return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i *)ptr)));
	}

	static inline FORCE_INLINE __m512i load32i(const uint8_t *ptr)
	{
		return _mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i *)ptr));
	}
};

struct LoadU16 {
	typedef uint16_t src_type;

	static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
	{
		return _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i *)ptr)));
	}

	static inline FORCE_INLINE __m512i load32i(const uint16_t *ptr)
	{
		return _mm512_load_si512(ptr);
	}
};

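// Store policies: mask_store32i()/mask_store16() narrow the register to the
// destination format where needed and write it with a per-element mask, so
// the partially covered vectors at the row borders never touch pixels
// outside [left, right).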
struct StoreU8 {
	typedef uint8_t dst_type;

	static inline FORCE_INLINE void mask_store32i(uint8_t *ptr, __mmask32 mask, __m512i x)
	{
		_mm256_mask_storeu_epi8(ptr, mask, _mm512_cvtepi16_epi8(x));
	}
};

struct StoreU16 {
	typedef uint16_t dst_type;

	static inline FORCE_INLINE void mask_store32i(uint16_t *ptr, __mmask32 mask, __m512i x)
	{
		_mm512_mask_storeu_epi16(ptr, mask, x);
	}
};

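// Half-precision store: VCVTPS2PH with rounding control 0 (round to nearest
// even) narrows 16 floats to 16 half-floats before the masked 16-bit store.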
struct StoreF16 {
	typedef uint16_t dst_type;

	static inline FORCE_INLINE void mask_store16(uint16_t *ptr, __mmask16 mask, __m512 x)
	{
		_mm256_mask_storeu_epi16(ptr, mask, _mm512_cvtps_ph(x, 0));
	}
};

struct StoreF32 {
	typedef float dst_type;

	static inline FORCE_INLINE void mask_store16(float *ptr, __mmask16 mask, __m512 x)
	{
		_mm512_mask_store_ps(ptr, mask, x);
	}
};

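// Left-shift kernel: processes 32 pixels per iteration as packed 16-bit words.
// [left, right) is split into an unaligned prologue, an aligned main loop, and
// an unaligned epilogue; the border vectors are written through masked stores
// built from mmask32_set_hi()/mmask32_set_lo().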
81 template <class Load, class Store>
left_shift_avx512_impl(const void * src,void * dst,unsigned shift,unsigned left,unsigned right)82 inline FORCE_INLINE void left_shift_avx512_impl(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
83 {
84 const typename Load::src_type *src_p = static_cast<const typename Load::src_type *>(src);
85 typename Store::dst_type *dst_p = static_cast<typename Store::dst_type *>(dst);
86
87 unsigned vec_left = ceil_n(left, 32);
88 unsigned vec_right = floor_n(right, 32);
89
90 __m128i count = _mm_set1_epi64x(shift);
91
92 if (left != vec_left) {
93 __m512i x = Load::load32i(src_p + vec_left - 32);
94 x = _mm512_sll_epi16(x, count);
95
96 Store::mask_store32i(dst_p + vec_left - 32, mmask32_set_hi(vec_left - left), x);
97 }
98
99 for (unsigned j = vec_left; j < vec_right; j += 32) {
100 __m512i x = Load::load32i(src_p + j);
101 x = _mm512_sll_epi16(x, count);
102
103 Store::mask_store32i(dst_p + j, 0xFFFFFFFFU, x);
104 }
105
106 if (right != vec_right) {
107 __m512i x = Load::load32i(src_p + vec_right);
108 x = _mm512_sll_epi16(x, count);
109
110 Store::mask_store32i(dst_p + vec_right, mmask32_set_lo(right - vec_right), x);
111 }
112 }
113
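// Integer-to-float kernel: computes x * scale + offset with a single FMA per
// vector, 16 pixels per iteration, using the same prologue/main loop/epilogue
// structure as the shift kernel but with 16-element masks.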
template <class Load, class Store>
inline FORCE_INLINE void depth_convert_avx512_impl(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const typename Load::src_type *src_p = static_cast<const typename Load::src_type *>(src);
	typename Store::dst_type *dst_p = static_cast<typename Store::dst_type *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	const __m512 scale_ps = _mm512_set1_ps(scale);
	const __m512 offset_ps = _mm512_set1_ps(offset);

	if (left != vec_left) {
		__m512 x = Load::load16(src_p + vec_left - 16);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + vec_left - 16, mmask16_set_hi(vec_left - left), x);
	}

	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m512 x = Load::load16(src_p + j);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + j, 0xFFFFU, x);
	}

	if (right != vec_right) {
		__m512 x = Load::load16(src_p + vec_right);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + vec_right, mmask16_set_lo(right - vec_right), x);
	}
}

} // namespace


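// Exported entry points. The suffix encodes source and destination formats:
// b = 8-bit, w = 16-bit, h = half precision, f = single precision.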
void left_shift_b2b_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU8, StoreU8>(src, dst, shift, left, right);
}

void left_shift_b2w_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU8, StoreU16>(src, dst, shift, left, right);
}

void left_shift_w2b_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU16, StoreU8>(src, dst, shift, left, right);
}

void left_shift_w2w_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU16, StoreU16>(src, dst, shift, left, right);
}

void depth_convert_b2h_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU8, StoreF16>(src, dst, scale, offset, left, right);
}

void depth_convert_b2f_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU8, StoreF32>(src, dst, scale, offset, left, right);
}

void depth_convert_w2h_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU16, StoreF16>(src, dst, scale, offset, left, right);
}

void depth_convert_w2f_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU16, StoreF32>(src, dst, scale, offset, left, right);
}

} // namespace depth
} // namespace zimg

#endif // ZIMG_X86_AVX512