1 /*****************************************************************************
2
3 TransLut_avx2.cpp
4 Author: Laurent de Soras, 2015
5
6 --- Legal stuff ---
7
8 This program is free software. It comes without any warranty, to
9 the extent permitted by applicable law. You can redistribute it
10 and/or modify it under the terms of the Do What The Fuck You Want
11 To Public License, Version 2, as published by Sam Hocevar. See
12 http://sam.zoy.org/wtfpl/COPYING for more details.
13
14 *Tab=3***********************************************************************/
15
16
17
18 #if defined (_MSC_VER)
19 #pragma warning (1 : 4130 4223 4705 4706)
20 #pragma warning (4 : 4355 4786 4800)
21 #endif
22
23
24
25 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
26
27 #include "fstb/def.h"
28
29 #include "fmtcl/TransLut.h"
30 #include "fstb/ToolsAvx2.h"
31
32 #include <immintrin.h>
33
34 #include <algorithm>
35
36 #include <cassert>
37
38
39
40
41
42 namespace fmtcl
43 {
44
45
46
// Helper turning a group of 8 input samples into LUT positions and
// interpolation fractions, 8 lanes at a time. The mapping depends on the
// M template parameter: find_index is only declared here, and this file
// provides explicit specialisations for TransLut::MapperLin and
// TransLut::MapperLog.
template <class M>
class TransLut_FindIndexAvx2
{
public:
	// Local copies of the linear-LUT constants declared in TransLut
	static constexpr int LINLUT_RES_L2 = TransLut::LINLUT_RES_L2;
	static constexpr int LINLUT_MIN_F = TransLut::LINLUT_MIN_F;
	static constexpr int LINLUT_MAX_F = TransLut::LINLUT_MAX_F;
	static constexpr int LINLUT_SIZE_F = TransLut::LINLUT_SIZE_F;

	// Local copies of the logarithmic-LUT constants declared in TransLut
	static constexpr int LOGLUT_MIN_L2 = TransLut::LOGLUT_MIN_L2;
	static constexpr int LOGLUT_MAX_L2 = TransLut::LOGLUT_MAX_L2;
	static constexpr int LOGLUT_RES_L2 = TransLut::LOGLUT_RES_L2;
	static constexpr int LOGLUT_HSIZE = TransLut::LOGLUT_HSIZE;
	static constexpr int LOGLUT_SIZE = TransLut::LOGLUT_SIZE;

	// val_arr: 8 consecutive input samples. The specialisations read them
	//          with aligned 256-bit loads, so the address must be 32-byte
	//          aligned.
	// index  : receives the 8 LUT positions (32-bit integers).
	// frac   : receives the 8 interpolation fractions to apply between the
	//          entries at index and index + 1.
	static inline void
	find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) noexcept;
};

// Namespace-scope definitions of the static constexpr data members.
// Before C++17 these are required as soon as a member is odr-used; from
// C++17 on they are redundant but still accepted.
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LINLUT_RES_L2;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LINLUT_MIN_F;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LINLUT_MAX_F;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LINLUT_SIZE_F;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LOGLUT_MIN_L2;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LOGLUT_MAX_L2;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LOGLUT_RES_L2;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LOGLUT_HSIZE;
template <class M> constexpr int TransLut_FindIndexAvx2 <M>::LOGLUT_SIZE;
75
76
77
78 template <>
find_index(const TransLut::FloatIntMix val_arr[8],__m256i & index,__m256 & frac)79 void TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) noexcept
80 {
81 assert (val_arr != nullptr);
82
83 const __m256 scale = _mm256_set1_ps (1 << LINLUT_RES_L2);
84 const __m256i offset =
85 _mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
86 const __m256i val_min = _mm256_setzero_si256 ();
87 const __m256i val_max = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);
88
89 const __m256 v =
90 _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
91 const __m256 val_scl = _mm256_mul_ps (v, scale);
92 const __m256i index_raw = _mm256_cvtps_epi32 (val_scl);
93 __m256i index_tmp = _mm256_add_epi32 (index_raw, offset);
94 index_tmp = _mm256_min_epi32 (index_tmp, val_max);
95 index = _mm256_max_epi32 (index_tmp, val_min);
96 frac = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
97 }
98
99
100
101 template <>
find_index(const TransLut::FloatIntMix val_arr[8],__m256i & index,__m256 & frac)102 void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) noexcept
103 {
104 assert (val_arr != nullptr);
105
106 // Constants
107 constexpr int mant_size = 23;
108 constexpr int exp_bias = 127;
109 constexpr uint32_t base = (exp_bias + LOGLUT_MIN_L2) << mant_size;
110 constexpr float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
111 // constexpr float val_max = float (int64_t (1) << LOGLUT_MAX_L2);
112 constexpr int frac_size = mant_size - LOGLUT_RES_L2;
113 constexpr uint32_t frac_mask = (1 << frac_size) - 1;
114
115 const __m256 zero_f = _mm256_setzero_ps ();
116 const __m256 one_f = _mm256_set1_ps (1);
117 const __m256 frac_mul = _mm256_set1_ps (1.0f / (1 << frac_size));
118 const __m256 mul_eps = _mm256_set1_ps (1.0f / val_min);
119 const __m256 mask_abs_f = _mm256_load_ps (
120 reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
121 );
122
123 const __m256i zero_i = _mm256_setzero_si256 ();
124 const __m256i mask_abs_epi32 = _mm256_set1_epi32 (0x7FFFFFFF);
125 const __m256i one_epi32 = _mm256_set1_epi32 (1);
126 const __m256i base_epi32 = _mm256_set1_epi32 (int (base));
127 const __m256i frac_mask_epi32 = _mm256_set1_epi32 (frac_mask);
128 const __m256i val_min_epi32 =
129 _mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
130 const __m256i val_max_epi32 =
131 _mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
132 const __m256i index_max_epi32 =
133 _mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
134 const __m256i hsize_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE);
135 const __m256i mirror_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE - 1);
136
137 // It really starts here
138 const __m256 val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
139 const __m256 val_a = _mm256_and_ps (val_f, mask_abs_f);
140 const __m256i val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr));
141 const __m256i val_u = _mm256_and_si256 (val_i, mask_abs_epi32);
142
143 // Standard path
144 __m256i index_std = _mm256_sub_epi32 (val_u, base_epi32);
145 index_std = _mm256_srli_epi32 (index_std, frac_size);
146 index_std = _mm256_add_epi32 (index_std, one_epi32);
147 __m256i frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32);
148 __m256 frac_std = _mm256_cvtepi32_ps (frac_stdi);
149 frac_std = _mm256_mul_ps (frac_std, frac_mul);
150
151 // Epsilon path
152 __m256 frac_eps = _mm256_max_ps (val_a, zero_f);
153 frac_eps = _mm256_mul_ps (frac_eps, mul_eps);
154
155 // Range cases
156 const __m256i eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u);
157 const __m256i std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u);
158 const __m256 eps_flag_f = _mm256_castsi256_ps (eps_flag_i);
159 const __m256 std_flag_f = _mm256_castsi256_ps (std_flag_i);
160 __m256i index_tmp =
161 fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32);
162 __m256 frac_tmp =
163 fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f);
164 index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp);
165 frac_tmp = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp);
166
167 // Sign cases
168 const __m256i neg_flag_i = _mm256_srai_epi32 (val_i, 31);
169 const __m256 neg_flag_f = _mm256_castsi256_ps (neg_flag_i);
170 const __m256i index_neg = _mm256_sub_epi32 (mirror_epi32, index_tmp);
171 const __m256i index_pos = _mm256_add_epi32 (hsize_epi32, index_tmp);
172 const __m256 frac_neg = _mm256_sub_ps (one_f, frac_tmp);
173 index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos);
174 frac = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp);
175 }
176
177
178
179 template <class T>
TransLut_store_avx2(T * dst_ptr,__m256 val)180 static fstb_FORCEINLINE void TransLut_store_avx2 (T *dst_ptr, __m256 val) noexcept
181 {
182 _mm256_store_si256 (
183 reinterpret_cast <__m256i *> (dst_ptr),
184 _mm256_cvtps_epi32 (val)
185 );
186 }
187
TransLut_store_avx2(float * dst_ptr,__m256 val)188 static fstb_FORCEINLINE void TransLut_store_avx2 (float *dst_ptr, __m256 val) noexcept
189 {
190 _mm256_store_ps (dst_ptr, val);
191 }
192
193
194
195 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
196
197
198
199 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
200
201
202
203 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
204
205
206
init_proc_fnc_avx2(int selector)207 void TransLut::init_proc_fnc_avx2 (int selector)
208 {
209 if (_avx2_flag)
210 {
211 switch (selector)
212 {
213 case 0*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <float , MapperLog>; break;
214 case 0*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <float , MapperLin>; break;
215 case 1*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <uint16_t, MapperLog>; break;
216 case 1*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <uint16_t, MapperLin>; break;
217 case 2*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <uint8_t , MapperLog>; break;
218 case 2*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_avx2 <uint8_t , MapperLin>; break;
219
220 default:
221 // Nothing
222 break;
223 }
224 }
225 }
226
227
228
229 template <class TD, class M>
process_plane_flt_any_avx2(Plane<> dst,PlaneRO<> src,int w,int h) const230 void TransLut::process_plane_flt_any_avx2 (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
231 {
232 assert (dst.is_valid (h));
233 assert (src.is_valid (h));
234 assert (w > 0);
235 assert (h > 0);
236
237 for (int y = 0; y < h; ++y)
238 {
239 const PlaneRO <FloatIntMix> s { src };
240 const Plane <TD> d { dst };
241
242 for (int x = 0; x < w; x += 8)
243 {
244 union
245 {
246 __m256i _vect;
247 uint32_t _scal [8];
248 } index;
249 __m256 lerp;
250 TransLut_FindIndexAvx2 <M>::find_index (s._ptr + x, index._vect, lerp);
251 #if 1 // Looks as fast as _mm256_set_ps
252 // G++ complains about sizeof() as argument
253 __m256 val = _mm256_i32gather_ps (
254 &_lut.use <float> (0), index._vect, 4 // 4 == sizeof (float)
255 );
256 const __m256 va2 = _mm256_i32gather_ps (
257 &_lut.use <float> (1), index._vect, 4 // 4 == sizeof (float)
258 );
259 #else
260 __m256 val = _mm256_set_ps (
261 _lut.use <float> (index._scal [7] ),
262 _lut.use <float> (index._scal [6] ),
263 _lut.use <float> (index._scal [5] ),
264 _lut.use <float> (index._scal [4] ),
265 _lut.use <float> (index._scal [3] ),
266 _lut.use <float> (index._scal [2] ),
267 _lut.use <float> (index._scal [1] ),
268 _lut.use <float> (index._scal [0] )
269 );
270 const __m256 va2 = _mm256_set_ps (
271 _lut.use <float> (index._scal [7] + 1),
272 _lut.use <float> (index._scal [6] + 1),
273 _lut.use <float> (index._scal [5] + 1),
274 _lut.use <float> (index._scal [4] + 1),
275 _lut.use <float> (index._scal [3] + 1),
276 _lut.use <float> (index._scal [2] + 1),
277 _lut.use <float> (index._scal [1] + 1),
278 _lut.use <float> (index._scal [0] + 1)
279 );
280 #endif
281 const __m256 dif = _mm256_sub_ps (va2, val);
282 val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp));
283 TransLut_store_avx2 (&d._ptr [x], val);
284 }
285
286 src.step_line ();
287 dst.step_line ();
288 }
289
290 _mm256_zeroupper (); // Back to SSE state
291 }
292
293
294
295 } // namespace fmtcl
296
297
298
299 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
300