1 /*****************************************************************************
2
3 Scaler.cpp
4 Author: Laurent de Soras, 2015
5
6 To be compiled with /arch:AVX2 in order to avoid SSE/AVX state switch
7 slowdown.
8
9 --- Legal stuff ---
10
11 This program is free software. It comes without any warranty, to
12 the extent permitted by applicable law. You can redistribute it
13 and/or modify it under the terms of the Do What The Fuck You Want
14 To Public License, Version 2, as published by Sam Hocevar. See
15 http://sam.zoy.org/wtfpl/COPYING for more details.
16
17 *Tab=3***********************************************************************/
18
19 #if defined (_MSC_VER)
20 #pragma warning (1 : 4130 4223 4705 4706)
21 #pragma warning (4 : 4355 4786 4800)
22 #endif
23
24
25
26 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
27
28 #include "fmtcl/ContFirInterface.h"
29 #include "fmtcl/ProxyRwAvx2.h"
30 #include "fmtcl/ReadWrapperFlt.h"
31 #include "fmtcl/ReadWrapperInt.h"
32 #include "fmtcl/Scaler.h"
33 #include "fmtcl/ScalerCopy.h"
34 #include "fstb/fnc.h"
35 #include "fstb/ToolsSse2.h"
36
37 #include <algorithm>
38
39 #include <cassert>
40 #include <climits>
41
42
43
44 namespace fmtcl
45 {
46
47
48
49 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
50
51
52
53 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
54
55
56
57 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
58
59
60
// Helper macros expanded by the fmtcl_Scaler_SPAN_* X-macro lists below.
// Each expansion binds one (destination format, source format) combination
// of the AVX2 plane-processing templates to the corresponding member
// function pointer of Scaler.
//   DT/ST: destination/source data types (unused here)
//   DE/SE: destination/source SplFmt enum suffixes
//   DB/SB: destination/source bitdepths (integer path only)
//   FN   : suffix of the function-pointer member to set
#define fmtcl_Scaler_INIT_F_AVX2(DT, ST, DE, SE, FN) \
	_process_plane_flt_##FN##_ptr = &ThisType::process_plane_flt_avx2 <ProxyRwAvx2 <SplFmt_##DE>, ProxyRwAvx2 <SplFmt_##SE> >;

#define fmtcl_Scaler_INIT_I_AVX2(DT, ST, DE, SE, DB, SB, FN) \
	_process_plane_int_##FN##_ptr = &ThisType::process_plane_int_avx2 <ProxyRwAvx2 <SplFmt_##DE>, DB, ProxyRwAvx2 <SplFmt_##SE>, SB>;

// Installs the AVX2 implementations on top of the default (SSE2/C++) ones.
// Called only after runtime detection confirmed AVX2 support.
void Scaler::setup_avx2 ()
{
	fmtcl_Scaler_SPAN_F (fmtcl_Scaler_INIT_F_AVX2)
#if ! defined (fmtcl_Scaler_SSE2_16BITS)
	// When the 16-bit integer path is forced to SSE2, keep those pointers.
	fmtcl_Scaler_SPAN_I (fmtcl_Scaler_INIT_I_AVX2)
#endif
}

#undef fmtcl_Scaler_INIT_F_AVX2
#undef fmtcl_Scaler_INIT_I_AVX2
77
78
79
80 template <class SRC, bool PF>
Scaler_process_vect_flt_avx2(__m256 & sum0,__m256 & sum1,int kernel_size,const float * coef_base_ptr,typename SRC::PtrConst::Type pix_ptr,const __m256i & zero,int src_stride,const __m256 & add_cst,int len)81 static fstb_FORCEINLINE void Scaler_process_vect_flt_avx2 (__m256 &sum0, __m256 &sum1, int kernel_size, const float *coef_base_ptr, typename SRC::PtrConst::Type pix_ptr, const __m256i &zero, int src_stride, const __m256 &add_cst, int len)
82 {
83 // Possible optimization: initialize the sum with DST::OFFSET + _add_cst_flt
84 // and save the add in the write proxy.
85 sum0 = add_cst;
86 sum1 = add_cst;
87
88 for (int k = 0; k < kernel_size; ++k)
89 {
90 __m256 coef = _mm256_set1_ps (coef_base_ptr [k]);
91 __m256 src0;
92 __m256 src1;
93 ReadWrapperFlt <SRC, PF>::read (pix_ptr, src0, src1, zero, len);
94 const __m256 val0 = _mm256_mul_ps (src0, coef);
95 const __m256 val1 = _mm256_mul_ps (src1, coef);
96 sum0 = _mm256_add_ps (sum0, val0);
97 sum1 = _mm256_add_ps (sum1, val1);
98
99 SRC::PtrConst::jump (pix_ptr, src_stride);
100 }
101 }
102
103
104
// Vertical rescaling of one plane, float path.
// DST and SRC are ProxyRwAvx2 classes
// Stride offsets in pixels
// Source pointer may be unaligned.
// Processes destination rows [y_dst_beg ; y_dst_end[; for each row, either
// copies the single source line (kernel degenerated to identity) or runs the
// convolution in 16-pixel strips with a masked tail.
template <class DST, class SRC>
void Scaler::process_plane_flt_avx2 (typename DST::Ptr::Type dst_ptr, typename SRC::PtrConst::Type src_ptr, int dst_stride, int src_stride, int width, int y_dst_beg, int y_dst_end) const
{
	assert (DST::Ptr::check_ptr (dst_ptr, DST::ALIGN_W));
	assert (SRC::PtrConst::check_ptr (src_ptr, SRC::ALIGN_R));
	// When the destination is a buffer:
	// mod4 is enough to guarantee alignment, but since we process pairs of
	// vectors and write_partial() is not different from write() with float
	// data (overwriting all the 64 bytes), we must take extra-care not to
	// overflow from the output buffer.
	// When the destination is not a buffer (output frame data), data may be
	// unaligned anyway. (TO DO: check the algorithm and make sure this
	// constraint is actual).
	assert ((dst_stride & 15) == 0);
	assert ((src_stride & 3) == 0);
	assert (width > 0);
	assert (y_dst_beg >= 0);
	assert (y_dst_beg < y_dst_end);
	assert (y_dst_end <= _dst_height);
	assert (width <= dst_stride);
	assert (width <= src_stride);

	// Constants shared by the read/write proxies: zero and masks for integer
	// (un)packing, offset and additive constant applied in float domain.
	const __m256i zero = _mm256_setzero_si256 ();
	const __m256i mask_lsb = _mm256_set1_epi16 (0x00FF);
	const __m256i sign_bit = _mm256_set1_epi16 (-0x8000);
	const __m256 offset = _mm256_set1_ps (float (DST::OFFSET));
	const __m256 add_cst = _mm256_set1_ps (float (_add_cst_flt));

	const int w16 = width & -16; // Pixels handled by full 16-wide strips
	const int w15 = width - w16; // Tail (0-15 pixels), masked path

	for (int y = y_dst_beg; y < y_dst_end; ++y)
	{
		// Per-row kernel description: size, coefficients and first source line.
		const KernelInfo& kernel_info = _kernel_info_arr [y];
		const int kernel_size = kernel_info._kernel_size;
		const float * coef_base_ptr = &_coef_flt_arr [kernel_info._coef_index];
		const int ofs_y = kernel_info._start_line;

		typename SRC::PtrConst::Type col_src_ptr = src_ptr;
		SRC::PtrConst::jump (col_src_ptr, src_stride * ofs_y);
		typename DST::Ptr::Type col_dst_ptr = dst_ptr;

		typedef ScalerCopy <DST, 0, SRC, 0> ScCopy;

		// Fast path: identity kernel and compatible formats, plain copy.
		if (ScCopy::can_copy (kernel_info._copy_flt_flag))
		{
			ScCopy::copy (col_dst_ptr, col_src_ptr, width);
		}

		else
		{
			__m256 sum0;
			__m256 sum1;

			// Full strips of 16 pixels.
			for (int x = 0; x < w16; x += 16)
			{
				typename SRC::PtrConst::Type pix_ptr = col_src_ptr;

				Scaler_process_vect_flt_avx2 <SRC, false> (
					sum0, sum1, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, add_cst, 0
				);
				DST::write_flt (
					col_dst_ptr, sum0, sum1, mask_lsb, sign_bit, offset
				);

				DST::Ptr::jump (col_dst_ptr, 16);
				SRC::PtrConst::jump (col_src_ptr, 16);
			}

			// Remaining pixels, read and written with masking.
			if (w15 > 0)
			{
				typename SRC::PtrConst::Type pix_ptr = col_src_ptr;

				Scaler_process_vect_flt_avx2 <SRC, true> (
					sum0, sum1, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, add_cst, w15
				);
				DST::write_flt_partial (
					col_dst_ptr, sum0, sum1, mask_lsb, sign_bit, offset, w15
				);
			}
		}

		DST::Ptr::jump (dst_ptr, dst_stride);
	}

	_mm256_zeroupper (); // Back to SSE state
}
197
198
199
200 template <class DST, int DB, class SRC, int SB, bool PF>
Scaler_process_vect_int_avx2(const __m256i & add_cst,int kernel_size,const __m256i coef_base_ptr[],typename SRC::PtrConst::Type pix_ptr,const __m256i & zero,int src_stride,const __m256i & sign_bit,int len)201 static fstb_FORCEINLINE __m256i Scaler_process_vect_int_avx2 (const __m256i &add_cst, int kernel_size, const __m256i coef_base_ptr [], typename SRC::PtrConst::Type pix_ptr, const __m256i &zero, int src_stride, const __m256i &sign_bit, int len)
202 {
203 typedef typename SRC::template S16 <false, (SB == 16)> SrcS16R;
204
205 __m256i sum0 = add_cst;
206 __m256i sum1 = add_cst;
207
208 for (int k = 0; k < kernel_size; ++k)
209 {
210 const __m256i coef = _mm256_load_si256 (coef_base_ptr + k);
211 const __m256i src = ReadWrapperInt <SRC, SrcS16R, PF>::read (
212 pix_ptr, zero, sign_bit, len
213 );
214
215 fstb::ToolsAvx2::mac_s16_s16_s32 (sum0, sum1, src, coef);
216
217 SRC::PtrConst::jump (pix_ptr, src_stride);
218 }
219
220 sum0 = _mm256_srai_epi32 (sum0, Scaler::SHIFT_INT + SB - DB);
221 sum1 = _mm256_srai_epi32 (sum1, Scaler::SHIFT_INT + SB - DB);
222
223 const __m256i val = _mm256_packs_epi32 (sum0, sum1);
224
225 return (val);
226 }
227
228
229
// Vertical rescaling of one plane, 16-bit fixed-point path.
// DST and SRC are ProxyRwAvx2 classes; DB and SB are the destination and
// source bitdepths. Processes destination rows [y_dst_beg ; y_dst_end[ in
// 16-pixel strips with a masked tail, or copies when the kernel is identity.
template <class DST, int DB, class SRC, int SB>
void Scaler::process_plane_int_avx2 (typename DST::Ptr::Type dst_ptr, typename SRC::PtrConst::Type src_ptr, int dst_stride, int src_stride, int width, int y_dst_beg, int y_dst_end) const
{
	assert (_can_int_flag);
	assert (DST::Ptr::check_ptr (dst_ptr, DST::ALIGN_W));
	assert (SRC::PtrConst::check_ptr (src_ptr, SRC::ALIGN_R));
	assert ((dst_stride & 15) == 0);
	assert (width > 0);
	assert (y_dst_beg >= 0);
	assert (y_dst_beg < y_dst_end);
	assert (y_dst_end <= _dst_height);
	assert (width <= dst_stride);
	assert (width <= src_stride);

	// Rounding constant for the final shift
	const int r_cst = 1 << (SHIFT_INT + SB - DB - 1);

	// Sign constants: when we have 16-bit data at one end only,
	// we need to make data signed at the opposite end. This sign
	// constant is reported on the summing constant.
	const int s_in = (SB < 16) ? -(0x8000 << (SHIFT_INT + SB - DB)) : 0;
	const int s_out = (DB < 16) ? 0x8000 << (SHIFT_INT + SB - DB) : 0;
	const int s_cst = s_in + s_out;

	// Constants for the read/write proxies and the accumulator seed.
	const __m256i zero = _mm256_setzero_si256 ();
	const __m256i mask_lsb = _mm256_set1_epi16 (0x00FF);
	const __m256i sign_bit = _mm256_set1_epi16 (-0x8000);
	const __m256i ma = _mm256_set1_epi16 (int16_t (uint16_t ((1 << DB) - 1)));
	const __m256i add_cst = _mm256_set1_epi32 (_add_cst_int + s_cst + r_cst);

	const int w16 = width & -16; // Pixels handled by full 16-wide strips
	const int w15 = width - w16; // Tail (0-15 pixels), masked path

	for (int y = y_dst_beg; y < y_dst_end; ++y)
	{
		// Per-row kernel description: size, first source line, and the
		// AVX2-interleaved 16-bit coefficient vectors.
		const KernelInfo& kernel_info = _kernel_info_arr [y];
		const int kernel_size = kernel_info._kernel_size;
		const int ofs_y = kernel_info._start_line;
		const __m256i * coef_base_ptr = reinterpret_cast <const __m256i *> (
			_coef_int_arr.use_vect_avx2 (kernel_info._coef_index)
		);

		typename SRC::PtrConst::Type col_src_ptr = src_ptr;
		SRC::PtrConst::jump (col_src_ptr, src_stride * ofs_y);
		typename DST::Ptr::Type col_dst_ptr = dst_ptr;

		typedef ScalerCopy <DST, DB, SRC, SB> ScCopy;

		// Fast path: identity kernel and compatible formats, plain copy.
		if (ScCopy::can_copy (kernel_info._copy_int_flag))
		{
			ScCopy::copy (col_dst_ptr, col_src_ptr, width);
		}

		else
		{
			// Writer converting the signed 16-bit results back to the
			// destination format, clipped to [0 ; (1 << DB) - 1].
			typedef typename DST::template S16 <false, (DB == 16)> DstS16W;

			// Full strips of 16 pixels.
			for (int x = 0; x < w16; x += 16)
			{
				typename SRC::PtrConst::Type pix_ptr = col_src_ptr;

				const __m256i val = Scaler_process_vect_int_avx2 <
					DST, DB, SRC, SB, false
				> (
					add_cst, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, sign_bit, 0
				);

				DstS16W::write_clip (
					col_dst_ptr,
					val,
					mask_lsb,
					zero,
					ma,
					sign_bit
				);

				DST::Ptr::jump (col_dst_ptr, 16);
				SRC::PtrConst::jump (col_src_ptr, 16);
			}

			// Remaining pixels, read and written with masking.
			if (w15 > 0)
			{
				typename SRC::PtrConst::Type pix_ptr = col_src_ptr;

				const __m256i val = Scaler_process_vect_int_avx2 <
					DST, DB, SRC, SB, true
				> (
					add_cst, kernel_size, coef_base_ptr,
					pix_ptr, zero, src_stride, sign_bit, w15
				);

				DstS16W::write_clip_partial (
					col_dst_ptr,
					val,
					mask_lsb,
					zero,
					ma,
					sign_bit,
					w15
				);
			}
		}

		DST::Ptr::jump (dst_ptr, dst_stride);
	}
}
337
338
339
340 } // namespace fmtcl
341
342
343
344 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
345