1 /*****************************************************************************
2
3 BitBltConv_avx2.cpp
4 Author: Laurent de Soras, 2015
5
6 To be compiled with /arch:AVX2 in order to avoid SSE/AVX state switch
7 slowdown.
8
9 --- Legal stuff ---
10
11 This program is free software. It comes without any warranty, to
12 the extent permitted by applicable law. You can redistribute it
13 and/or modify it under the terms of the Do What The Fuck You Want
14 To Public License, Version 2, as published by Sam Hocevar. See
15 http://sam.zoy.org/wtfpl/COPYING for more details.
16
17 *Tab=3***********************************************************************/
18
19
20
21 #if defined (_MSC_VER)
22 #pragma warning (1 : 4130 4223 4705 4706)
23 #pragma warning (4 : 4355 4786 4800)
24 #endif
25
26
27
28 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
29
30 #include "fmtcl/BitBltConv.h"
31 #include "fmtcl/Proxy.h"
32 #include "fmtcl/ProxyRwAvx2.h"
33 #include "fstb/fnc.h"
34
35 #include <stdexcept>
36
37 #include <cassert>
38
39
40
41 namespace fmtcl
42 {
43
44
45
46 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
47
48
49
50 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
51
52
53
54 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
55
56
57
bitblt_int_to_flt_avx2_switch(uint8_t * dst_ptr,int dst_stride,fmtcl::SplFmt src_fmt,int src_res,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)58 void BitBltConv::bitblt_int_to_flt_avx2_switch (uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
59 {
60 const uint8_t * src_i08_ptr (src_ptr);
61 const Proxy::PtrInt16Const::Type src_i16_ptr (
62 reinterpret_cast <const uint16_t *> (src_ptr)
63 );
64
65 const bool scale_flag = ! is_si_neutral (scale_info_ptr);
66
67 #define fmtcl_BitBltConv_CASE(SCF, SFMT, SRES, SPTR) \
68 case ((SCF << 16) + (SplFmt_##SFMT << 8) + SRES): \
69 bitblt_int_to_flt_avx2 <SCF, ProxyRwAvx2 <SplFmt_##SFMT>, SRES> ( \
70 dst_ptr, dst_stride, src_##SPTR##_ptr, src_stride, \
71 w, h, scale_info_ptr \
72 ); \
73 break;
74
75 switch ((scale_flag << 16) + (src_fmt << 8) + src_res)
76 {
77 fmtcl_BitBltConv_CASE (false, INT16 , 16, i16)
78 fmtcl_BitBltConv_CASE (false, INT16 , 12, i16)
79 fmtcl_BitBltConv_CASE (false, INT16 , 10, i16)
80 fmtcl_BitBltConv_CASE (false, INT16 , 9, i16)
81 fmtcl_BitBltConv_CASE (false, INT8 , 8, i08)
82 fmtcl_BitBltConv_CASE (true , INT16 , 16, i16)
83 fmtcl_BitBltConv_CASE (true , INT16 , 12, i16)
84 fmtcl_BitBltConv_CASE (true , INT16 , 10, i16)
85 fmtcl_BitBltConv_CASE (true , INT16 , 9, i16)
86 fmtcl_BitBltConv_CASE (true , INT8 , 8, i08)
87 default:
88 assert (false);
89 throw std::logic_error (
90 "fmtcl::BitBltConv::bitblt: "
91 "illegal int-to-float pixel format conversion."
92 );
93 }
94
95 #undef fmtcl_BitBltConv_CASE
96 }
97
98
99
bitblt_flt_to_int_avx2_switch(fmtcl::SplFmt dst_fmt,int dst_res,uint8_t * dst_ptr,int dst_stride,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)100 void BitBltConv::bitblt_flt_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
101 {
102 fstb::unused (dst_res);
103
104 const Proxy::PtrInt16::Type dst_i16_ptr (
105 reinterpret_cast <uint16_t *> (dst_ptr)
106 );
107
108 const bool scale_flag = ! is_si_neutral (scale_info_ptr);
109
110 #define fmtcl_BitBltConv_CASE(SCF, DFMT, DPTR) \
111 case (SCF << 4) + SplFmt_##DFMT: \
112 bitblt_flt_to_int_avx2 <SCF, ProxyRwAvx2 <SplFmt_##DFMT> > ( \
113 dst_##DPTR##_ptr, dst_stride, src_ptr, src_stride, \
114 w, h, scale_info_ptr \
115 ); \
116 break;
117
118 switch ((scale_flag << 4) + dst_fmt)
119 {
120 fmtcl_BitBltConv_CASE (false, INT16 , i16)
121 fmtcl_BitBltConv_CASE (true , INT16 , i16)
122 default:
123 assert (false);
124 throw std::logic_error (
125 "fmtcl::BitBltConv::bitblt: "
126 "illegal float-to-int pixel format conversion."
127 );
128 }
129
130 #undef fmtcl_BitBltConv_CASE
131 }
132
133
134
bitblt_int_to_int_avx2_switch(fmtcl::SplFmt dst_fmt,int dst_res,uint8_t * dst_ptr,int dst_stride,fmtcl::SplFmt src_fmt,int src_res,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)135 void BitBltConv::bitblt_int_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
136 {
137 fstb::unused (scale_info_ptr);
138
139 const uint8_t * src_i08_ptr (src_ptr);
140 const Proxy::PtrInt16Const::Type src_i16_ptr (
141 reinterpret_cast <const uint16_t *> (src_ptr)
142 );
143 const Proxy::PtrInt16::Type dst_i16_ptr (
144 reinterpret_cast <uint16_t *> (dst_ptr)
145 );
146
147 #define fmtcl_BitBltConv_CASE(DFMT, SFMT, DRES, SRES, DPTR, SPTR) \
148 case ((SplFmt_##DFMT << 20) + (SplFmt_##SFMT << 16) + (DRES << 8) + SRES): \
149 bitblt_ixx_to_x16_avx2 < \
150 ProxyRwAvx2 <SplFmt_##DFMT>, ProxyRwAvx2 <SplFmt_##SFMT>, \
151 DRES, SRES \
152 > (dst_##DPTR##_ptr, dst_stride, src_##SPTR##_ptr, src_stride, w, h); \
153 break;
154
155 switch ((dst_fmt << 20) + (src_fmt << 16) + (dst_res << 8) + src_res)
156 {
157 fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 12, i16, i16)
158 fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 10, i16, i16)
159 fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 9, i16, i16)
160 fmtcl_BitBltConv_CASE (INT16 , INT8 , 16, 8, i16, i08)
161 fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 10, i16, i16)
162 fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 9, i16, i16)
163 fmtcl_BitBltConv_CASE (INT16 , INT8 , 12, 8, i16, i08)
164 fmtcl_BitBltConv_CASE (INT16 , INT16 , 10, 9, i16, i16)
165 fmtcl_BitBltConv_CASE (INT16 , INT8 , 10, 8, i16, i08)
166 fmtcl_BitBltConv_CASE (INT16 , INT8 , 9, 8, i16, i08)
167 default:
168 assert (false);
169 throw std::logic_error (
170 "fmtcl::BitBltConv::bitblt: "
171 "illegal int-to-int pixel format conversion."
172 );
173 }
174
175 #undef fmtcl_BitBltConv_CASE
176 }
177
178
179
180 // Stride offsets are still in bytes
181 // Destination pointer must be 32-byte aligned!
182 template <bool SF, class SRC, int SBD>
bitblt_int_to_flt_avx2(uint8_t * dst_ptr,int dst_stride,typename SRC::PtrConst::Type src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)183 void BitBltConv::bitblt_int_to_flt_avx2 (uint8_t *dst_ptr, int dst_stride, typename SRC::PtrConst::Type src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
184 {
185 assert (fstb::ToolsAvx2::check_ptr_align (dst_ptr));
186 assert (SRC::PtrConst::check_ptr (src_ptr));
187 assert (w > 0);
188 assert (h > 0);
189 assert (! SF || scale_info_ptr != nullptr);
190
191 __m256 gain;
192 __m256 add_cst;
193 if (SF)
194 {
195 gain = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_gain ) : 1);
196 add_cst = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_add_cst) : 0);
197 }
198
199 float * dst_flt_ptr = reinterpret_cast <float *> (dst_ptr);
200
201 src_stride /= sizeof (typename SRC::PtrConst::DataType);
202 dst_stride /= sizeof (*dst_flt_ptr);
203
204 const __m256i zero = _mm256_setzero_si256 ();
205
206 const int w16 = w & -16;
207 const int w15 = w - w16;
208
209 for (int y = 0; y < h; ++y)
210 {
211 typename SRC::PtrConst::Type cur_src_ptr (src_ptr);
212 __m256 val_0007;
213 __m256 val_0815;
214
215 for (int x = 0; x < w16; x += 16)
216 {
217 SRC::read_flt (cur_src_ptr, val_0007, val_0815, zero);
218 if (SF)
219 {
220 val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
221 val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
222 }
223 _mm256_store_ps (dst_flt_ptr + x , val_0007);
224 _mm256_store_ps (dst_flt_ptr + x + 8, val_0815);
225
226 SRC::PtrConst::jump (cur_src_ptr, 16);
227 }
228
229 if (w15 > 0)
230 {
231 SRC::read_flt (cur_src_ptr, val_0007, val_0815, zero);
232 if (SF)
233 {
234 val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
235 val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
236 }
237 _mm256_store_ps (dst_flt_ptr + w16, val_0007);
238 if (w15 > 8)
239 {
240 _mm256_store_ps (dst_flt_ptr + w16 + 8, val_0815);
241 }
242 }
243
244 SRC::PtrConst::jump (src_ptr, src_stride);
245 dst_flt_ptr += dst_stride;
246 }
247 }
248
249
250
251 // Stride offsets are still in bytes
252 template <bool SF, class DST>
bitblt_flt_to_int_avx2(typename DST::Ptr::Type dst_ptr,int dst_stride,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)253 void BitBltConv::bitblt_flt_to_int_avx2 (typename DST::Ptr::Type dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
254 {
255 assert (DST::Ptr::check_ptr (dst_ptr));
256 assert (src_ptr != nullptr);
257 assert (w > 0);
258 assert (h > 0);
259 assert (! SF || scale_info_ptr != nullptr);
260
261 __m256 gain;
262 __m256 add_cst;
263 if (SF)
264 {
265 gain = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_gain ) : 1);
266 add_cst = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_add_cst) : 0);
267 }
268
269 const float * src_flt_ptr = reinterpret_cast <const float *> (src_ptr);
270
271 src_stride /= sizeof (*src_flt_ptr);
272 dst_stride /= sizeof (typename DST::Ptr::DataType);
273
274 const __m256i mask_lsb = _mm256_set1_epi16 (0x00FF);
275 const __m256i sign_bit = _mm256_set1_epi16 (-0x8000);
276 const __m256i zero = _mm256_setzero_si256 ();
277 const __m256 offset = _mm256_set1_ps (-32768);
278
279 const int w16 = w & -16;
280 const int w15 = w - w16;
281
282 for (int y = 0; y < h; ++y)
283 {
284 typename DST::Ptr::Type cur_dst_ptr = dst_ptr;
285 __m256 val_0007;
286 __m256 val_0815;
287
288 for (int x = 0; x < w16; x += 16)
289 {
290 val_0007 = _mm256_loadu_ps (src_flt_ptr + x );
291 val_0815 = _mm256_loadu_ps (src_flt_ptr + x + 8);
292 if (SF)
293 {
294 val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
295 val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
296 }
297 DST::write_flt (
298 cur_dst_ptr, val_0007, val_0815, mask_lsb, sign_bit, offset
299 );
300
301 DST::Ptr::jump (cur_dst_ptr, 16);
302 }
303
304 if (w15 > 0)
305 {
306 ProxyRwAvx2 <SplFmt_FLOAT>::read_flt_partial (
307 src_flt_ptr + w16, val_0007, val_0815, zero, w15
308 );
309 if (SF)
310 {
311 val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
312 val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
313 }
314 DST::write_flt_partial (
315 cur_dst_ptr, val_0007, val_0815, mask_lsb, sign_bit, offset, w15
316 );
317 }
318
319 DST::Ptr::jump (dst_ptr, dst_stride);
320 src_flt_ptr += src_stride;
321 }
322 }
323
324
325
326 // Stride offsets are still in bytes
327 template <class DST, class SRC, int DBD, int SBD>
bitblt_ixx_to_x16_avx2(typename DST::Ptr::Type dst_ptr,int dst_stride,typename SRC::PtrConst::Type src_ptr,int src_stride,int w,int h)328 void BitBltConv::bitblt_ixx_to_x16_avx2 (typename DST::Ptr::Type dst_ptr, int dst_stride, typename SRC::PtrConst::Type src_ptr, int src_stride, int w, int h)
329 {
330 assert (DST::Ptr::check_ptr (dst_ptr));
331 assert (SRC::PtrConst::check_ptr (src_ptr));
332 assert (w > 0);
333 assert (h > 0);
334
335 assert (DST::Ptr::check_ptr (dst_ptr));
336 assert (SRC::PtrConst::check_ptr (src_ptr));
337 assert (w > 0);
338 assert (h > 0);
339
340 src_stride /= sizeof (typename SRC::PtrConst::DataType);
341 dst_stride /= sizeof (typename DST::Ptr::DataType);
342
343 const __m256i zero = _mm256_setzero_si256 ();
344 const __m256i val_ma = _mm256_set1_epi16 ((DBD < 16) ? (1 << DBD) - 1 : 0);
345 const __m256i mask_lsb = _mm256_set1_epi16 (0x00FF);
346
347 const int w16 = w & -16;
348 const int w15 = w - w16;
349
350 for (int y = 0; y < h; ++y)
351 {
352 typename DST::Ptr::Type cur_dst_ptr = dst_ptr;
353 typename SRC::PtrConst::Type cur_src_ptr = src_ptr;
354
355 for (int x = 0; x < w16; x += 16)
356 {
357 __m256i val = SRC::read_i16 (cur_src_ptr, zero);
358 if (DBD != SBD)
359 {
360 val = _mm256_slli_epi16 (val, DBD - SBD);
361 }
362 if (DBD < 16)
363 {
364 val = _mm256_min_epi16 (val, val_ma);
365 }
366 DST::write_i16 (cur_dst_ptr, val, mask_lsb);
367
368 SRC::PtrConst::jump (cur_src_ptr, 16);
369 DST::Ptr::jump (cur_dst_ptr, 16);
370 }
371
372 if (w15 > 0)
373 {
374 __m256i val = SRC::read_i16_partial (cur_src_ptr, zero, w15);
375 if (DBD != SBD)
376 {
377 val = _mm256_slli_epi16 (val, DBD - SBD);
378 }
379 if (DBD < 16)
380 {
381 val = _mm256_min_epi16 (val, val_ma);
382 }
383 DST::write_i16_partial (cur_dst_ptr, val, mask_lsb, w15);
384 }
385
386 SRC::PtrConst::jump (src_ptr, src_stride);
387 DST::Ptr::jump (dst_ptr, dst_stride);
388 }
389 }
390
391
392
393 } // namespace fmtcl
394
395
396
397 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
398