1 /*****************************************************************************
2 
3         BitBltConv_avx2.cpp
4         Author: Laurent de Soras, 2015
5 
6 To be compiled with /arch:AVX2 in order to avoid SSE/AVX state switch
7 slowdown.
8 
9 --- Legal stuff ---
10 
11 This program is free software. It comes without any warranty, to
12 the extent permitted by applicable law. You can redistribute it
13 and/or modify it under the terms of the Do What The Fuck You Want
14 To Public License, Version 2, as published by Sam Hocevar. See
15 http://sam.zoy.org/wtfpl/COPYING for more details.
16 
17 *Tab=3***********************************************************************/
18 
19 
20 
21 #if defined (_MSC_VER)
22 	#pragma warning (1 : 4130 4223 4705 4706)
23 	#pragma warning (4 : 4355 4786 4800)
24 #endif
25 
26 
27 
28 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
29 
30 #include "fmtcl/BitBltConv.h"
31 #include "fmtcl/Proxy.h"
32 #include "fmtcl/ProxyRwAvx2.h"
33 #include "fstb/fnc.h"
34 
35 #include <stdexcept>
36 
37 #include <cassert>
38 
39 
40 
41 namespace fmtcl
42 {
43 
44 
45 
46 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
47 
48 
49 
50 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
51 
52 
53 
54 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
55 
56 
57 
bitblt_int_to_flt_avx2_switch(uint8_t * dst_ptr,int dst_stride,fmtcl::SplFmt src_fmt,int src_res,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)58 void	BitBltConv::bitblt_int_to_flt_avx2_switch (uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
59 {
60 	const uint8_t *                    src_i08_ptr (src_ptr);
61 	const Proxy::PtrInt16Const::Type   src_i16_ptr (
62 		reinterpret_cast <const uint16_t *> (src_ptr)
63 	);
64 
65 	const bool     scale_flag = ! is_si_neutral (scale_info_ptr);
66 
67 #define	fmtcl_BitBltConv_CASE(SCF, SFMT, SRES, SPTR) \
68 	case	((SCF << 16) + (SplFmt_##SFMT << 8) + SRES): \
69 		bitblt_int_to_flt_avx2 <SCF, ProxyRwAvx2 <SplFmt_##SFMT>, SRES> ( \
70 			dst_ptr, dst_stride, src_##SPTR##_ptr, src_stride, \
71 			w, h, scale_info_ptr \
72 		); \
73 		break;
74 
75 	switch ((scale_flag << 16) + (src_fmt << 8) + src_res)
76 	{
77 	fmtcl_BitBltConv_CASE (false, INT16  , 16, i16)
78 	fmtcl_BitBltConv_CASE (false, INT16  , 12, i16)
79 	fmtcl_BitBltConv_CASE (false, INT16  , 10, i16)
80 	fmtcl_BitBltConv_CASE (false, INT16  ,  9, i16)
81 	fmtcl_BitBltConv_CASE (false, INT8   ,  8, i08)
82 	fmtcl_BitBltConv_CASE (true , INT16  , 16, i16)
83 	fmtcl_BitBltConv_CASE (true , INT16  , 12, i16)
84 	fmtcl_BitBltConv_CASE (true , INT16  , 10, i16)
85 	fmtcl_BitBltConv_CASE (true , INT16  ,  9, i16)
86 	fmtcl_BitBltConv_CASE (true , INT8   ,  8, i08)
87 	default:
88 		assert (false);
89 		throw std::logic_error (
90 			"fmtcl::BitBltConv::bitblt: "
91 			"illegal int-to-float pixel format conversion."
92 		);
93 	}
94 
95 #undef fmtcl_BitBltConv_CASE
96 }
97 
98 
99 
bitblt_flt_to_int_avx2_switch(fmtcl::SplFmt dst_fmt,int dst_res,uint8_t * dst_ptr,int dst_stride,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)100 void	BitBltConv::bitblt_flt_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
101 {
102 	fstb::unused (dst_res);
103 
104 	const Proxy::PtrInt16::Type   dst_i16_ptr (
105 		reinterpret_cast <uint16_t *> (dst_ptr)
106 	);
107 
108 	const bool     scale_flag = ! is_si_neutral (scale_info_ptr);
109 
110 #define	fmtcl_BitBltConv_CASE(SCF, DFMT, DPTR) \
111 	case	(SCF << 4) + SplFmt_##DFMT: \
112 		bitblt_flt_to_int_avx2 <SCF, ProxyRwAvx2 <SplFmt_##DFMT> > ( \
113 			dst_##DPTR##_ptr, dst_stride, src_ptr, src_stride, \
114 			w, h, scale_info_ptr \
115 		); \
116 		break;
117 
118 	switch ((scale_flag << 4) + dst_fmt)
119 	{
120 	fmtcl_BitBltConv_CASE (false, INT16  , i16)
121 	fmtcl_BitBltConv_CASE (true , INT16  , i16)
122 	default:
123 		assert (false);
124 		throw std::logic_error (
125 			"fmtcl::BitBltConv::bitblt: "
126 			"illegal float-to-int pixel format conversion."
127 		);
128 	}
129 
130 #undef fmtcl_BitBltConv_CASE
131 }
132 
133 
134 
bitblt_int_to_int_avx2_switch(fmtcl::SplFmt dst_fmt,int dst_res,uint8_t * dst_ptr,int dst_stride,fmtcl::SplFmt src_fmt,int src_res,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)135 void	BitBltConv::bitblt_int_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
136 {
137 	fstb::unused (scale_info_ptr);
138 
139 	const uint8_t *                    src_i08_ptr (src_ptr);
140 	const Proxy::PtrInt16Const::Type   src_i16_ptr (
141 		reinterpret_cast <const uint16_t *> (src_ptr)
142 	);
143 	const Proxy::PtrInt16::Type        dst_i16_ptr (
144 		reinterpret_cast <uint16_t *> (dst_ptr)
145 	);
146 
147 #define	fmtcl_BitBltConv_CASE(DFMT, SFMT, DRES, SRES, DPTR, SPTR) \
148 	case	((SplFmt_##DFMT << 20) + (SplFmt_##SFMT << 16) + (DRES << 8) + SRES): \
149 		bitblt_ixx_to_x16_avx2 < \
150 			ProxyRwAvx2 <SplFmt_##DFMT>, ProxyRwAvx2 <SplFmt_##SFMT>, \
151 			DRES, SRES \
152 		> (dst_##DPTR##_ptr, dst_stride, src_##SPTR##_ptr, src_stride, w, h); \
153 		break;
154 
155 	switch ((dst_fmt << 20) + (src_fmt << 16) + (dst_res << 8) + src_res)
156 	{
157 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 16, 12, i16, i16)
158 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 16, 10, i16, i16)
159 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 16,  9, i16, i16)
160 	fmtcl_BitBltConv_CASE (INT16  , INT8   , 16,  8, i16, i08)
161 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 12, 10, i16, i16)
162 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 12,  9, i16, i16)
163 	fmtcl_BitBltConv_CASE (INT16  , INT8   , 12,  8, i16, i08)
164 	fmtcl_BitBltConv_CASE (INT16  , INT16  , 10,  9, i16, i16)
165 	fmtcl_BitBltConv_CASE (INT16  , INT8   , 10,  8, i16, i08)
166 	fmtcl_BitBltConv_CASE (INT16  , INT8   ,  9,  8, i16, i08)
167 	default:
168 		assert (false);
169 		throw std::logic_error (
170 			"fmtcl::BitBltConv::bitblt: "
171 			"illegal int-to-int pixel format conversion."
172 		);
173 	}
174 
175 #undef fmtcl_BitBltConv_CASE
176 }
177 
178 
179 
180 // Stride offsets are still in bytes
181 // Destination pointer must be 32-byte aligned!
182 template <bool SF, class SRC, int SBD>
bitblt_int_to_flt_avx2(uint8_t * dst_ptr,int dst_stride,typename SRC::PtrConst::Type src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)183 void	BitBltConv::bitblt_int_to_flt_avx2 (uint8_t *dst_ptr, int dst_stride, typename SRC::PtrConst::Type src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
184 {
185 	assert (fstb::ToolsAvx2::check_ptr_align (dst_ptr));
186 	assert (SRC::PtrConst::check_ptr (src_ptr));
187 	assert (w > 0);
188 	assert (h > 0);
189 	assert (! SF || scale_info_ptr != nullptr);
190 
191 	__m256         gain;
192 	__m256         add_cst;
193 	if (SF)
194 	{
195 		gain    = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_gain   ) : 1);
196 		add_cst = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_add_cst) : 0);
197 	}
198 
199 	float *        dst_flt_ptr = reinterpret_cast <float *> (dst_ptr);
200 
201 	src_stride /= sizeof (typename SRC::PtrConst::DataType);
202 	dst_stride /= sizeof (*dst_flt_ptr);
203 
204 	const __m256i	zero = _mm256_setzero_si256 ();
205 
206 	const int      w16 = w & -16;
207 	const int      w15 = w - w16;
208 
209 	for (int y = 0; y < h; ++y)
210 	{
211 		typename SRC::PtrConst::Type  cur_src_ptr (src_ptr);
212 		__m256         val_0007;
213 		__m256         val_0815;
214 
215 		for (int x = 0; x < w16; x += 16)
216 		{
217 			SRC::read_flt (cur_src_ptr, val_0007, val_0815, zero);
218 			if (SF)
219 			{
220 				val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
221 				val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
222 			}
223 			_mm256_store_ps (dst_flt_ptr + x    , val_0007);
224 			_mm256_store_ps (dst_flt_ptr + x + 8, val_0815);
225 
226 			SRC::PtrConst::jump (cur_src_ptr, 16);
227 		}
228 
229 		if (w15 > 0)
230 		{
231 			SRC::read_flt (cur_src_ptr, val_0007, val_0815, zero);
232 			if (SF)
233 			{
234 				val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
235 				val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
236 			}
237 			_mm256_store_ps (dst_flt_ptr + w16, val_0007);
238 			if (w15 > 8)
239 			{
240 				_mm256_store_ps (dst_flt_ptr + w16 + 8, val_0815);
241 			}
242 		}
243 
244 		SRC::PtrConst::jump (src_ptr, src_stride);
245 		dst_flt_ptr += dst_stride;
246 	}
247 }
248 
249 
250 
251 // Stride offsets are still in bytes
252 template <bool SF, class DST>
bitblt_flt_to_int_avx2(typename DST::Ptr::Type dst_ptr,int dst_stride,const uint8_t * src_ptr,int src_stride,int w,int h,const ScaleInfo * scale_info_ptr)253 void	BitBltConv::bitblt_flt_to_int_avx2 (typename DST::Ptr::Type dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const ScaleInfo *scale_info_ptr)
254 {
255 	assert (DST::Ptr::check_ptr (dst_ptr));
256 	assert (src_ptr != nullptr);
257 	assert (w > 0);
258 	assert (h > 0);
259 	assert (! SF || scale_info_ptr != nullptr);
260 
261 	__m256         gain;
262 	__m256         add_cst;
263 	if (SF)
264 	{
265 		gain    = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_gain   ) : 1);
266 		add_cst = _mm256_set1_ps ((SF) ? float (scale_info_ptr->_add_cst) : 0);
267 	}
268 
269 	const float *  src_flt_ptr = reinterpret_cast <const float *> (src_ptr);
270 
271 	src_stride /= sizeof (*src_flt_ptr);
272 	dst_stride /= sizeof (typename DST::Ptr::DataType);
273 
274 	const __m256i  mask_lsb = _mm256_set1_epi16 (0x00FF);
275 	const __m256i  sign_bit = _mm256_set1_epi16 (-0x8000);
276 	const __m256i  zero     = _mm256_setzero_si256 ();
277 	const __m256   offset   = _mm256_set1_ps (-32768);
278 
279 	const int      w16 = w & -16;
280 	const int      w15 = w - w16;
281 
282 	for (int y = 0; y < h; ++y)
283 	{
284 		typename DST::Ptr::Type cur_dst_ptr = dst_ptr;
285 		__m256         val_0007;
286 		__m256         val_0815;
287 
288 		for (int x = 0; x < w16; x += 16)
289 		{
290 			val_0007 = _mm256_loadu_ps (src_flt_ptr + x    );
291 			val_0815 = _mm256_loadu_ps (src_flt_ptr + x + 8);
292 			if (SF)
293 			{
294 				val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
295 				val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
296 			}
297 			DST::write_flt (
298 				cur_dst_ptr, val_0007, val_0815, mask_lsb, sign_bit, offset
299 			);
300 
301 			DST::Ptr::jump (cur_dst_ptr, 16);
302 		}
303 
304 		if (w15 > 0)
305 		{
306 			ProxyRwAvx2 <SplFmt_FLOAT>::read_flt_partial (
307 				src_flt_ptr + w16, val_0007, val_0815, zero, w15
308 			);
309 			if (SF)
310 			{
311 				val_0007 = _mm256_add_ps (_mm256_mul_ps (val_0007, gain), add_cst);
312 				val_0815 = _mm256_add_ps (_mm256_mul_ps (val_0815, gain), add_cst);
313 			}
314 			DST::write_flt_partial (
315 				cur_dst_ptr, val_0007, val_0815, mask_lsb, sign_bit, offset, w15
316 			);
317 		}
318 
319 		DST::Ptr::jump (dst_ptr, dst_stride);
320 		src_flt_ptr += src_stride;
321 	}
322 }
323 
324 
325 
326 // Stride offsets are still in bytes
327 template <class DST, class SRC, int DBD, int SBD>
bitblt_ixx_to_x16_avx2(typename DST::Ptr::Type dst_ptr,int dst_stride,typename SRC::PtrConst::Type src_ptr,int src_stride,int w,int h)328 void	BitBltConv::bitblt_ixx_to_x16_avx2 (typename DST::Ptr::Type dst_ptr, int dst_stride, typename SRC::PtrConst::Type src_ptr, int src_stride, int w, int h)
329 {
330 	assert (DST::Ptr::check_ptr (dst_ptr));
331 	assert (SRC::PtrConst::check_ptr (src_ptr));
332 	assert (w > 0);
333 	assert (h > 0);
334 
335 	assert (DST::Ptr::check_ptr (dst_ptr));
336 	assert (SRC::PtrConst::check_ptr (src_ptr));
337 	assert (w > 0);
338 	assert (h > 0);
339 
340 	src_stride /= sizeof (typename SRC::PtrConst::DataType);
341 	dst_stride /= sizeof (typename DST::Ptr::DataType);
342 
343 	const __m256i  zero     = _mm256_setzero_si256 ();
344 	const __m256i  val_ma   = _mm256_set1_epi16 ((DBD < 16) ? (1 << DBD) - 1 : 0);
345 	const __m256i  mask_lsb = _mm256_set1_epi16 (0x00FF);
346 
347 	const int      w16 = w & -16;
348 	const int      w15 = w - w16;
349 
350 	for (int y = 0; y < h; ++y)
351 	{
352 		typename DST::Ptr::Type       cur_dst_ptr = dst_ptr;
353 		typename SRC::PtrConst::Type  cur_src_ptr = src_ptr;
354 
355 		for (int x = 0; x < w16; x += 16)
356 		{
357 			__m256i        val = SRC::read_i16 (cur_src_ptr, zero);
358 			if (DBD != SBD)
359 			{
360 				val = _mm256_slli_epi16 (val, DBD - SBD);
361 			}
362 			if (DBD < 16)
363 			{
364 				val = _mm256_min_epi16 (val, val_ma);
365 			}
366 			DST::write_i16 (cur_dst_ptr, val, mask_lsb);
367 
368 			SRC::PtrConst::jump (cur_src_ptr, 16);
369 			DST::Ptr::jump (cur_dst_ptr, 16);
370 		}
371 
372 		if (w15 > 0)
373 		{
374 			__m256i        val = SRC::read_i16_partial (cur_src_ptr, zero, w15);
375 			if (DBD != SBD)
376 			{
377 				val = _mm256_slli_epi16 (val, DBD - SBD);
378 			}
379 			if (DBD < 16)
380 			{
381 				val = _mm256_min_epi16 (val, val_ma);
382 			}
383 			DST::write_i16_partial (cur_dst_ptr, val, mask_lsb, w15);
384 		}
385 
386 		SRC::PtrConst::jump (src_ptr, src_stride);
387 		DST::Ptr::jump (dst_ptr, dst_stride);
388 	}
389 }
390 
391 
392 
393 }	// namespace fmtcl
394 
395 
396 
397 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
398