#ifdef ZIMG_X86_AVX512

#include <cstdint>
#include <immintrin.h>
#include "common/align.h"
#include "common/ccdep.h"
#include "depth_convert_x86.h"

#include "common/x86/avx512_util.h"

namespace zimg {
namespace depth {

namespace {

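// Pixel-load traits. load16() widens 16 packed unsigned pixels into a vector of
// 16 floats; load32i() widens 32 packed unsigned pixels into 32 unsigned 16-bit
// lanes of a 512-bit integer vector. All loads are from aligned addresses.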
struct LoadU8 {
	typedef uint8_t src_type;

	static inline FORCE_INLINE __m512 load16(const uint8_t *ptr)
	{
		return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i *)ptr)));
	}

	static inline FORCE_INLINE __m512i load32i(const uint8_t *ptr)
	{
		return _mm512_cvtepu8_epi16(_mm256_load_si256((const __m256i *)ptr));
	}
};

struct LoadU16 {
	typedef uint16_t src_type;

	static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
	{
		return _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i *)ptr)));
	}

	static inline FORCE_INLINE __m512i load32i(const uint16_t *ptr)
	{
		return _mm512_load_si512(ptr);
	}
};

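// Integer-store traits. mask_store32i() writes 32 output pixels under a 32-bit
// write mask, narrowing the 16-bit lanes to 8-bit in the U8 case, so partial
// vectors at row boundaries only touch the masked lanes.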
struct StoreU8 {
	typedef uint8_t dst_type;

	static inline FORCE_INLINE void mask_store32i(uint8_t *ptr, __mmask32 mask, __m512i x)
	{
		_mm256_mask_storeu_epi8(ptr, mask, _mm512_cvtepi16_epi8(x));
	}
};

struct StoreU16 {
	typedef uint16_t dst_type;

	static inline FORCE_INLINE void mask_store32i(uint16_t *ptr, __mmask32 mask, __m512i x)
	{
		_mm512_mask_storeu_epi16(ptr, mask, x);
	}
};


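// Floating-point store traits. mask_store16() writes 16 output values under a
// 16-bit write mask, converting to half precision first in the F16 case.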
struct StoreF16 {
	typedef uint16_t dst_type;

	static inline FORCE_INLINE void mask_store16(uint16_t *ptr, __mmask16 mask, __m512 x)
	{
		_mm256_mask_storeu_epi16(ptr, mask, _mm512_cvtps_ph(x, 0));
	}
};

struct StoreF32 {
	typedef float dst_type;

	static inline FORCE_INLINE void mask_store16(float *ptr, __mmask16 mask, __m512 x)
	{
		_mm512_mask_store_ps(ptr, mask, x);
	}
};

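// Shift a row of pixels left by a constant bit count, 32 pixels per iteration.
// The row is processed in three parts: a masked head covering [left, vec_left),
// full vectors over [vec_left, vec_right), and a masked tail covering
// [vec_right, right), so stores never spill outside the [left, right) range.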
template <class Load, class Store>
inline FORCE_INLINE void left_shift_avx512_impl(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const typename Load::src_type *src_p = static_cast<const typename Load::src_type *>(src);
	typename Store::dst_type *dst_p = static_cast<typename Store::dst_type *>(dst);

	unsigned vec_left = ceil_n(left, 32);
	unsigned vec_right = floor_n(right, 32);

	__m128i count = _mm_set1_epi64x(shift);

	if (left != vec_left) {
		__m512i x = Load::load32i(src_p + vec_left - 32);
		x = _mm512_sll_epi16(x, count);

		Store::mask_store32i(dst_p + vec_left - 32, mmask32_set_hi(vec_left - left), x);
	}

	for (unsigned j = vec_left; j < vec_right; j += 32) {
		__m512i x = Load::load32i(src_p + j);
		x = _mm512_sll_epi16(x, count);

		Store::mask_store32i(dst_p + j, 0xFFFFFFFFU, x);
	}

	if (right != vec_right) {
		__m512i x = Load::load32i(src_p + vec_right);
		x = _mm512_sll_epi16(x, count);

		Store::mask_store32i(dst_p + vec_right, mmask32_set_lo(right - vec_right), x);
	}
}

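// Convert a row of integer pixels to floating point as x * scale + offset,
// 16 pixels per iteration, using the same masked head/body/tail structure as
// the shift kernel above.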
template <class Load, class Store>
inline FORCE_INLINE void depth_convert_avx512_impl(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const typename Load::src_type *src_p = static_cast<const typename Load::src_type *>(src);
	typename Store::dst_type *dst_p = static_cast<typename Store::dst_type *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	const __m512 scale_ps = _mm512_set1_ps(scale);
	const __m512 offset_ps = _mm512_set1_ps(offset);

	if (left != vec_left) {
		__m512 x = Load::load16(src_p + vec_left - 16);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + vec_left - 16, mmask16_set_hi(vec_left - left), x);
	}

	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m512 x = Load::load16(src_p + j);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + j, 0xFFFFU, x);
	}

	if (right != vec_right) {
		__m512 x = Load::load16(src_p + vec_right);
		x = _mm512_fmadd_ps(scale_ps, x, offset_ps);

		Store::mask_store16(dst_p + vec_right, mmask16_set_lo(right - vec_right), x);
	}
}


} // namespace


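// Exported entry points are named <src>2<dst>: b = byte (uint8_t), w = word
// (uint16_t), h = half-precision float, f = single-precision float.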
void left_shift_b2b_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU8, StoreU8>(src, dst, shift, left, right);
}

void left_shift_b2w_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU8, StoreU16>(src, dst, shift, left, right);
}

void left_shift_w2b_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU16, StoreU8>(src, dst, shift, left, right);
}

void left_shift_w2w_avx512(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	left_shift_avx512_impl<LoadU16, StoreU16>(src, dst, shift, left, right);
}

void depth_convert_b2h_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU8, StoreF16>(src, dst, scale, offset, left, right);
}

void depth_convert_b2f_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU8, StoreF32>(src, dst, scale, offset, left, right);
}

void depth_convert_w2h_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU16, StoreF16>(src, dst, scale, offset, left, right);
}

void depth_convert_w2f_avx512(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	depth_convert_avx512_impl<LoadU16, StoreF32>(src, dst, scale, offset, left, right);
}

} // namespace depth
} // namespace zimg

#endif // ZIMG_X86_AVX512