/*****************************************************************************

        Scaler.cpp
        Author: Laurent de Soras, 2015

To be compiled with /arch:AVX2 in order to avoid SSE/AVX state switch
slowdown.

--- Legal stuff ---

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.

*Tab=3***********************************************************************/

#if defined (_MSC_VER)
	#pragma warning (1 : 4130 4223 4705 4706)
	#pragma warning (4 : 4355 4786 4800)
#endif



/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/

#include "fmtcl/ContFirInterface.h"
#include "fmtcl/ProxyRwAvx2.h"
#include "fmtcl/ReadWrapperFlt.h"
#include "fmtcl/ReadWrapperInt.h"
#include "fmtcl/Scaler.h"
#include "fmtcl/ScalerCopy.h"
#include "fstb/fnc.h"
#include "fstb/ToolsSse2.h"

#include <algorithm>

#include <cassert>
#include <climits>



namespace fmtcl
{



/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/



/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/



/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/


#define fmtcl_Scaler_INIT_F_AVX2(DT, ST, DE, SE, FN) \
	_process_plane_flt_##FN##_ptr = &ThisType::process_plane_flt_avx2 <ProxyRwAvx2 <SplFmt_##DE>, ProxyRwAvx2 <SplFmt_##SE> >;

#define fmtcl_Scaler_INIT_I_AVX2(DT, ST, DE, SE, DB, SB, FN) \
	_process_plane_int_##FN##_ptr = &ThisType::process_plane_int_avx2 <ProxyRwAvx2 <SplFmt_##DE>, DB, ProxyRwAvx2 <SplFmt_##SE>, SB>;
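
// Illustration only: the real (DT, ST, DE, SE, ...) tuples come from the
// fmtcl_Scaler_SPAN_* macros defined in the Scaler headers. For a
// hypothetical float entry with FN = f32_f32 and DE = SE = FLOAT,
// fmtcl_Scaler_INIT_F_AVX2 would expand to roughly:
//
//	_process_plane_flt_f32_f32_ptr = &ThisType::process_plane_flt_avx2 <
//		ProxyRwAvx2 <SplFmt_FLOAT>, ProxyRwAvx2 <SplFmt_FLOAT> >;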

void	Scaler::setup_avx2 ()
{
	fmtcl_Scaler_SPAN_F (fmtcl_Scaler_INIT_F_AVX2)
#if ! defined (fmtcl_Scaler_SSE2_16BITS)
	fmtcl_Scaler_SPAN_I (fmtcl_Scaler_INIT_I_AVX2)
#endif
}

#undef fmtcl_Scaler_INIT_F_AVX2
#undef fmtcl_Scaler_INIT_I_AVX2



template <class SRC, bool PF>
static fstb_FORCEINLINE void	Scaler_process_vect_flt_avx2 (__m256 &sum0, __m256 &sum1, int kernel_size, const float *coef_base_ptr, typename SRC::PtrConst::Type pix_ptr, const __m256i &zero, int src_stride, const __m256 &add_cst, int len)
{
	// Possible optimization: initialize the sum with DST::OFFSET + _add_cst_flt
	// and save the add in the write proxy.
	sum0 = add_cst;
	sum1 = add_cst;

	for (int k = 0; k < kernel_size; ++k)
	{
		__m256         coef = _mm256_set1_ps (coef_base_ptr [k]);
		__m256         src0;
		__m256         src1;
		ReadWrapperFlt <SRC, PF>::read (pix_ptr, src0, src1, zero, len);
		const __m256   val0 = _mm256_mul_ps (src0, coef);
		const __m256   val1 = _mm256_mul_ps (src1, coef);
		sum0 = _mm256_add_ps (sum0, val0);
		sum1 = _mm256_add_ps (sum1, val1);

		SRC::PtrConst::jump (pix_ptr, src_stride);
	}
}



// DST and SRC are ProxyRwAvx2 classes
// Stride offsets in pixels
// Source pointer may be unaligned.
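// Description based on how the proxies are used below (see ProxyRwAvx2.h
// for the authoritative API): SRC reads expose the native 8/16-bit or float
// samples as two __m256 vectors of floats, and DST::write_flt () converts
// back to the destination sample format, using mask_lsb, sign_bit and
// offset for the integer formats.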
template <class DST, class SRC>
void	Scaler::process_plane_flt_avx2 (typename DST::Ptr::Type dst_ptr, typename SRC::PtrConst::Type src_ptr, int dst_stride, int src_stride, int width, int y_dst_beg, int y_dst_end) const
{
	assert (DST::Ptr::check_ptr (dst_ptr, DST::ALIGN_W));
	assert (SRC::PtrConst::check_ptr (src_ptr, SRC::ALIGN_R));
	// When the destination is a buffer:
	// mod4 is enough to guarantee alignment, but since we process pairs of
	// vectors and write_partial() is no different from write() with float
	// data (it overwrites all 64 bytes), we must take extra care not to
	// write past the end of the output buffer.
	// When the destination is not a buffer (output frame data), data may be
	// unaligned anyway. (TO DO: check the algorithm and make sure this
	// constraint actually applies.)
	assert ((dst_stride & 15) == 0);
	assert ((src_stride & 3) == 0);
	assert (width > 0);
	assert (y_dst_beg >= 0);
	assert (y_dst_beg < y_dst_end);
	assert (y_dst_end <= _dst_height);
	assert (width <= dst_stride);
	assert (width <= src_stride);

	const __m256i  zero     = _mm256_setzero_si256 ();
	const __m256i  mask_lsb = _mm256_set1_epi16 (0x00FF);
	const __m256i  sign_bit = _mm256_set1_epi16 (-0x8000);
	const __m256   offset   = _mm256_set1_ps (float (DST::OFFSET));
	const __m256   add_cst  = _mm256_set1_ps (float (_add_cst_flt));

	const int      w16 = width & -16;
	const int      w15 = width - w16;
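	// Arithmetic note: w16 is the width rounded down to a multiple of 16
	// (the number of float samples processed per iteration of the main
	// loop), and w15 is the 0-15 leftover columns handled by the partial
	// write path.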

	for (int y = y_dst_beg; y < y_dst_end; ++y)
	{
		const KernelInfo& kernel_info   = _kernel_info_arr [y];
		const int         kernel_size   = kernel_info._kernel_size;
		const float *     coef_base_ptr = &_coef_flt_arr [kernel_info._coef_index];
		const int         ofs_y         = kernel_info._start_line;

		typename SRC::PtrConst::Type  col_src_ptr = src_ptr;
		SRC::PtrConst::jump (col_src_ptr, src_stride * ofs_y);
		typename DST::Ptr::Type       col_dst_ptr = dst_ptr;

		typedef ScalerCopy <DST, 0, SRC, 0> ScCopy;

		if (ScCopy::can_copy (kernel_info._copy_flt_flag))
		{
			ScCopy::copy (col_dst_ptr, col_src_ptr, width);
		}

		else
		{
			__m256         sum0;
			__m256         sum1;

			for (int x = 0; x < w16; x += 16)
			{
				typename SRC::PtrConst::Type  pix_ptr = col_src_ptr;

				Scaler_process_vect_flt_avx2 <SRC, false> (
					sum0, sum1, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, add_cst, 0
				);
				DST::write_flt (
					col_dst_ptr, sum0, sum1, mask_lsb, sign_bit, offset
				);

				DST::Ptr::jump (col_dst_ptr, 16);
				SRC::PtrConst::jump (col_src_ptr, 16);
			}

			if (w15 > 0)
			{
				typename SRC::PtrConst::Type  pix_ptr = col_src_ptr;

				Scaler_process_vect_flt_avx2 <SRC, true> (
					sum0, sum1, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, add_cst, w15
				);
				DST::write_flt_partial (
					col_dst_ptr, sum0, sum1, mask_lsb, sign_bit, offset, w15
				);
			}
		}

		DST::Ptr::jump (dst_ptr, dst_stride);
	}

	_mm256_zeroupper ();	// Back to SSE state
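	// See the /arch:AVX2 note at the top of the file: leaving the upper
	// YMM halves dirty would incur SSE/AVX state-switch penalties in any
	// legacy SSE code executed afterwards, hence the explicit zeroupper.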
}



template <class DST, int DB, class SRC, int SB, bool PF>
static fstb_FORCEINLINE __m256i	Scaler_process_vect_int_avx2 (const __m256i &add_cst, int kernel_size, const __m256i coef_base_ptr [], typename SRC::PtrConst::Type pix_ptr, const __m256i &zero, int src_stride, const __m256i &sign_bit, int len)
{
	typedef typename SRC::template S16 <false, (SB == 16)> SrcS16R;
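
	// Assumption from usage (see ProxyRwAvx2.h for the actual contract):
	// the second template argument of S16 enables the unsigned-to-signed
	// conversion needed when the samples occupy the full 16 bits, since
	// the signed multiply-accumulate below would overflow on raw
	// full-range unsigned data.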

	__m256i        sum0 = add_cst;
	__m256i        sum1 = add_cst;

	for (int k = 0; k < kernel_size; ++k)
	{
		const __m256i  coef = _mm256_load_si256 (coef_base_ptr + k);
		const __m256i  src  = ReadWrapperInt <SRC, SrcS16R, PF>::read (
			pix_ptr, zero, sign_bit, len
		);

		fstb::ToolsAvx2::mac_s16_s16_s32 (sum0, sum1, src, coef);

		SRC::PtrConst::jump (pix_ptr, src_stride);
	}

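	// The coefficients are fixed-point values carrying SHIFT_INT fractional
	// bits (per the constant's use here) and the samples carry SB
	// significant bits, so a single right shift by SHIFT_INT + SB - DB both
	// removes the fraction and rescales the sums to the DB-bit output depth.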
	sum0 = _mm256_srai_epi32 (sum0, Scaler::SHIFT_INT + SB - DB);
	sum1 = _mm256_srai_epi32 (sum1, Scaler::SHIFT_INT + SB - DB);

	const __m256i  val = _mm256_packs_epi32 (sum0, sum1);

	return (val);
}



template <class DST, int DB, class SRC, int SB>
void	Scaler::process_plane_int_avx2 (typename DST::Ptr::Type dst_ptr, typename SRC::PtrConst::Type src_ptr, int dst_stride, int src_stride, int width, int y_dst_beg, int y_dst_end) const
{
	assert (_can_int_flag);
	assert (DST::Ptr::check_ptr (dst_ptr, DST::ALIGN_W));
	assert (SRC::PtrConst::check_ptr (src_ptr, SRC::ALIGN_R));
	assert ((dst_stride & 15) == 0);
	assert (width > 0);
	assert (y_dst_beg >= 0);
	assert (y_dst_beg < y_dst_end);
	assert (y_dst_end <= _dst_height);
	assert (width <= dst_stride);
	assert (width <= src_stride);

	// Rounding constant for the final shift
	const int      r_cst    = 1 << (SHIFT_INT + SB - DB - 1);

	// Sign constants: when we have 16-bit data at one end only, we need to
	// make the data signed at the opposite end. This sign correction is
	// folded into the summing constant.
	const int      s_in     = (SB < 16) ? -(0x8000 << (SHIFT_INT + SB - DB)) : 0;
	const int      s_out    = (DB < 16) ?   0x8000 << (SHIFT_INT + SB - DB)  : 0;
	const int      s_cst    = s_in + s_out;
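
	// Rationale (expanding on the comment above): mac_s16_s16_s32 and the
	// saturating _mm256_packs_epi32 both work on signed 16-bit data, so a
	// sign correction is needed whenever only one end of the pipeline uses
	// full 16-bit samples; pre-adding s_cst and r_cst to the summing
	// constant keeps the per-pixel inner loop free of extra operations.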

	const __m256i  zero     = _mm256_setzero_si256 ();
	const __m256i  mask_lsb = _mm256_set1_epi16 (0x00FF);
	const __m256i  sign_bit = _mm256_set1_epi16 (-0x8000);
	const __m256i  ma       = _mm256_set1_epi16 (int16_t (uint16_t ((1 << DB) - 1)));
	const __m256i  add_cst  = _mm256_set1_epi32 (_add_cst_int + s_cst + r_cst);

	const int      w16 = width & -16;
	const int      w15 = width - w16;
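	// Same column split as in the float path: a 16-pixel main loop (w16)
	// plus a 0-15 pixel tail (w15) handled by the *_partial writer.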

	for (int y = y_dst_beg; y < y_dst_end; ++y)
	{
		const KernelInfo&    kernel_info   = _kernel_info_arr [y];
		const int            kernel_size   = kernel_info._kernel_size;
		const int            ofs_y         = kernel_info._start_line;
		const __m256i *      coef_base_ptr = reinterpret_cast <const __m256i *> (
			_coef_int_arr.use_vect_avx2 (kernel_info._coef_index)
		);

		typename SRC::PtrConst::Type  col_src_ptr = src_ptr;
		SRC::PtrConst::jump (col_src_ptr, src_stride * ofs_y);
		typename DST::Ptr::Type       col_dst_ptr = dst_ptr;

		typedef ScalerCopy <DST, DB, SRC, SB> ScCopy;

		if (ScCopy::can_copy (kernel_info._copy_int_flag))
		{
			ScCopy::copy (col_dst_ptr, col_src_ptr, width);
		}

		else
		{
			typedef typename DST::template S16 <false, (DB == 16)> DstS16W;

			for (int x = 0; x < w16; x += 16)
			{
				typename SRC::PtrConst::Type  pix_ptr = col_src_ptr;

				const __m256i  val = Scaler_process_vect_int_avx2 <
					DST, DB, SRC, SB, false
				> (
					add_cst, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, sign_bit, 0
				);

				DstS16W::write_clip (
					col_dst_ptr,
					val,
					mask_lsb,
					zero,
					ma,
					sign_bit
				);

				DST::Ptr::jump (col_dst_ptr, 16);
				SRC::PtrConst::jump (col_src_ptr, 16);
			}

			if (w15 > 0)
			{
				typename SRC::PtrConst::Type  pix_ptr = col_src_ptr;

				const __m256i  val = Scaler_process_vect_int_avx2 <
					DST, DB, SRC, SB, true
				> (
					add_cst, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, sign_bit, w15
				);

				DstS16W::write_clip_partial (
					col_dst_ptr,
					val,
					mask_lsb,
					zero,
					ma,
					sign_bit,
					w15
				);
			}
		}

		DST::Ptr::jump (dst_ptr, dst_stride);
	}
}



}	// namespace fmtcl



/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/