1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file utils.h
24  *
25  * @brief Utilities used by SWR core related to pixel formats.
26  *
27  ******************************************************************************/
28 #pragma once
29 
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
32 
33 INLINE
vTranspose(simd4scalar & row0,simd4scalar & row1,simd4scalar & row2,simd4scalar & row3)34 void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
35 {
36     simd4scalari row0i = SIMD128::castps_si(row0);
37     simd4scalari row1i = SIMD128::castps_si(row1);
38     simd4scalari row2i = SIMD128::castps_si(row2);
39     simd4scalari row3i = SIMD128::castps_si(row3);
40 
41     simd4scalari vTemp = row2i;
42     row2i              = SIMD128::unpacklo_epi32(row2i, row3i);
43     vTemp              = SIMD128::unpackhi_epi32(vTemp, row3i);
44 
45     row3i = row0i;
46     row0i = SIMD128::unpacklo_epi32(row0i, row1i);
47     row3i = SIMD128::unpackhi_epi32(row3i, row1i);
48 
49     row1i = row0i;
50     row0i = SIMD128::unpacklo_epi64(row0i, row2i);
51     row1i = SIMD128::unpackhi_epi64(row1i, row2i);
52 
53     row2i = row3i;
54     row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
55     row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
56 
57     row0 = SIMD128::castsi_ps(row0i);
58     row1 = SIMD128::castsi_ps(row1i);
59     row2 = SIMD128::castsi_ps(row2i);
60     row3 = SIMD128::castsi_ps(row3i);
61 }
62 
63 INLINE
vTranspose(simd4scalari & row0,simd4scalari & row1,simd4scalari & row2,simd4scalari & row3)64 void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
65 {
66     simd4scalari vTemp = row2;
67     row2               = SIMD128::unpacklo_epi32(row2, row3);
68     vTemp              = SIMD128::unpackhi_epi32(vTemp, row3);
69 
70     row3 = row0;
71     row0 = SIMD128::unpacklo_epi32(row0, row1);
72     row3 = SIMD128::unpackhi_epi32(row3, row1);
73 
74     row1 = row0;
75     row0 = SIMD128::unpacklo_epi64(row0, row2);
76     row1 = SIMD128::unpackhi_epi64(row1, row2);
77 
78     row2 = row3;
79     row2 = SIMD128::unpacklo_epi64(row2, vTemp);
80     row3 = SIMD128::unpackhi_epi64(row3, vTemp);
81 }
82 
83 #if KNOB_SIMD_WIDTH == 8
84 INLINE
vTranspose3x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2)85 void vTranspose3x8(simd4scalar (&vDst)[8],
86                    const simdscalar& vSrc0,
87                    const simdscalar& vSrc1,
88                    const simdscalar& vSrc2)
89 {
90     simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2);              // x0z0x1z1 x4z4x5z5
91     simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
92     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);                // x0y0z0w0 x4y4z4w4
93     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);                // x1y1z1w1 x5y5z5w5
94 
95     r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2);              // x2z2x3z3 x6z6x7z7
96     r1rx                  = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6yw77
97     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);                // x2y2z2w2 x6y6z6w6
98     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);                // x3y3z3w3 x7y7z7w7
99 
100     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
101     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
102     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
103     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
104 
105     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
106     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
107     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
108     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
109 }
110 
111 INLINE
vTranspose4x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2,const simdscalar & vSrc3)112 void vTranspose4x8(simd4scalar (&vDst)[8],
113                    const simdscalar& vSrc0,
114                    const simdscalar& vSrc1,
115                    const simdscalar& vSrc2,
116                    const simdscalar& vSrc3)
117 {
118     simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
119     simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
120     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);   // x0y0z0w0 x4y4z4w4
121     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);   // x1y1z1w1 x5y5z5w5
122 
123     r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
124     r1rx                  = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6yw77
125     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);   // x2y2z2w2 x6y6z6w6
126     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);   // x3y3z3w3 x7y7z7w7
127 
128     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
129     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
130     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
131     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
132 
133     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
134     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
135     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
136     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
137 }
138 
139 INLINE
vTranspose4x16(simd16scalar (& dst)[4],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2,const simd16scalar & src3)140 void vTranspose4x16(simd16scalar (&dst)[4],
141                     const simd16scalar& src0,
142                     const simd16scalar& src1,
143                     const simd16scalar& src2,
144                     const simd16scalar& src3)
145 {
146     const simd16scalari perm =
147         _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
148 
149     // pre-permute input to setup the right order after all the unpacking
150 
151     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
152     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
153     simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
154     simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
155 
156     simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
157     simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
158     simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
159     simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
160 
161     dst[0] = _simd16_unpacklo_ps(rblo, galo);
162     dst[1] = _simd16_unpackhi_ps(rblo, galo);
163     dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
164     dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
165 }
166 
167 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalar & vMask0,const simdscalar & vMask1,const simdscalar & vMask2,const simdscalar & vMask3,const simdscalar & vMask4,const simdscalar & vMask5,const simdscalar & vMask6,const simdscalar & vMask7)168 void vTranspose8x8(simdscalar (&vDst)[8],
169                    const simdscalar& vMask0,
170                    const simdscalar& vMask1,
171                    const simdscalar& vMask2,
172                    const simdscalar& vMask3,
173                    const simdscalar& vMask4,
174                    const simdscalar& vMask5,
175                    const simdscalar& vMask6,
176                    const simdscalar& vMask7)
177 {
178     simdscalar __t0  = _simd_unpacklo_ps(vMask0, vMask1);
179     simdscalar __t1  = _simd_unpackhi_ps(vMask0, vMask1);
180     simdscalar __t2  = _simd_unpacklo_ps(vMask2, vMask3);
181     simdscalar __t3  = _simd_unpackhi_ps(vMask2, vMask3);
182     simdscalar __t4  = _simd_unpacklo_ps(vMask4, vMask5);
183     simdscalar __t5  = _simd_unpackhi_ps(vMask4, vMask5);
184     simdscalar __t6  = _simd_unpacklo_ps(vMask6, vMask7);
185     simdscalar __t7  = _simd_unpackhi_ps(vMask6, vMask7);
186     simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
187     simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
188     simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
189     simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
190     simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
191     simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
192     simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
193     simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
194     vDst[0]          = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
195     vDst[1]          = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
196     vDst[2]          = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
197     vDst[3]          = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
198     vDst[4]          = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
199     vDst[5]          = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
200     vDst[6]          = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
201     vDst[7]          = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
202 }
203 
204 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalari & vMask0,const simdscalari & vMask1,const simdscalari & vMask2,const simdscalari & vMask3,const simdscalari & vMask4,const simdscalari & vMask5,const simdscalari & vMask6,const simdscalari & vMask7)205 void vTranspose8x8(simdscalar (&vDst)[8],
206                    const simdscalari& vMask0,
207                    const simdscalari& vMask1,
208                    const simdscalari& vMask2,
209                    const simdscalari& vMask3,
210                    const simdscalari& vMask4,
211                    const simdscalari& vMask5,
212                    const simdscalari& vMask6,
213                    const simdscalari& vMask7)
214 {
215     vTranspose8x8(vDst,
216                   _simd_castsi_ps(vMask0),
217                   _simd_castsi_ps(vMask1),
218                   _simd_castsi_ps(vMask2),
219                   _simd_castsi_ps(vMask3),
220                   _simd_castsi_ps(vMask4),
221                   _simd_castsi_ps(vMask5),
222                   _simd_castsi_ps(vMask6),
223                   _simd_castsi_ps(vMask7));
224 }
225 #endif
226 
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
230 template <uint32_t bpp>
231 struct TransposeSingleComponent
232 {
233     //////////////////////////////////////////////////////////////////////////
234     /// @brief Pass-thru for single component.
235     /// @param pSrc - source data in SOA form
236     /// @param pDst - output data in AOS form
TransposeTransposeSingleComponent237     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
238     {
239         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
240     }
241 
Transpose_simd16TransposeSingleComponent242     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
243     {
244         memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
245     }
246 };
247 
248 //////////////////////////////////////////////////////////////////////////
249 /// Transpose8_8_8_8
250 //////////////////////////////////////////////////////////////////////////
251 struct Transpose8_8_8_8
252 {
253     //////////////////////////////////////////////////////////////////////////
254     /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
255     /// @param pSrc - source data in SOA form
256     /// @param pDst - output data in AOS form
TransposeTranspose8_8_8_8257     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
258     {
259         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
260 
261 #if KNOB_SIMD_WIDTH == 8
262 #if KNOB_ARCH <= KNOB_ARCH_AVX
263         simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
264         simd4scalari c2c3 =
265             SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
266         simd4scalari c0c2    = SIMD128::unpacklo_epi64(c0c1, c2c3);            // rrrrrrrrbbbbbbbb
267         simd4scalari c1c3    = SIMD128::unpackhi_epi64(c0c1, c2c3);            // ggggggggaaaaaaaa
268         simd4scalari c01     = SIMD128::unpacklo_epi8(c0c2, c1c3);             // rgrgrgrgrgrgrgrg
269         simd4scalari c23     = SIMD128::unpackhi_epi8(c0c2, c1c3);             // babababababababa
270         simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);              // rgbargbargbargba
271         simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);              // rgbargbargbargba
272         SIMD128::store_si((simd4scalari*)pDst, c0123lo);
273         SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
274 #else
275         simdscalari dst01 = _simd_shuffle_epi8(src,
276                                                _simd_set_epi32(0x0f078080,
277                                                                0x0e068080,
278                                                                0x0d058080,
279                                                                0x0c048080,
280                                                                0x80800b03,
281                                                                0x80800a02,
282                                                                0x80800901,
283                                                                0x80800800));
284         simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
285         dst23             = _simd_shuffle_epi8(dst23,
286                                    _simd_set_epi32(0x80800f07,
287                                                    0x80800e06,
288                                                    0x80800d05,
289                                                    0x80800c04,
290                                                    0x0b038080,
291                                                    0x0a028080,
292                                                    0x09018080,
293                                                    0x08008080));
294         simdscalari dst   = _simd_or_si(dst01, dst23);
295         _simd_store_si((simdscalari*)pDst, dst);
296 #endif
297 #else
298 #error Unsupported vector width
299 #endif
300     }
301 
Transpose_simd16Transpose8_8_8_8302     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
303     {
304 #if KNOB_SIMD16_WIDTH == 16
305         // clang-format off
306 
307         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc));      // rrrrrrrrrrrrrrrr
308         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1);  // gggggggggggggggg
309         simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2);  // bbbbbbbbbbbbbbbb
310         simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3);  // aaaaaaaaaaaaaaaa
311 
312         simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
313         simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
314         simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
315         simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
316 
317         simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
318         simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
319         simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
320 
321         simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
322 
323         _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst);  // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
324 
325         // clang-format on
326 #else
327 #error Unsupported vector width
328 #endif
329     }
330 };
331 
332 //////////////////////////////////////////////////////////////////////////
333 /// Transpose8_8_8
334 //////////////////////////////////////////////////////////////////////////
335 struct Transpose8_8_8
336 {
337     //////////////////////////////////////////////////////////////////////////
338     /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
339     /// @param pSrc - source data in SOA form
340     /// @param pDst - output data in AOS form
341     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
342     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
343 };
344 
345 //////////////////////////////////////////////////////////////////////////
346 /// Transpose8_8
347 //////////////////////////////////////////////////////////////////////////
348 struct Transpose8_8
349 {
350     //////////////////////////////////////////////////////////////////////////
351     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
352     /// @param pSrc - source data in SOA form
353     /// @param pDst - output data in AOS form
TransposeTranspose8_8354     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
355     {
356 #if KNOB_SIMD_WIDTH == 8
357         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
358 
359         simd4scalari rg = src.v4[0];                       // rrrrrrrr gggggggg
360         simd4scalari g  = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
361         rg              = SIMD128::unpacklo_epi8(rg, g);
362         SIMD128::store_si((simd4scalari*)pDst, rg);
363 #else
364 #error Unsupported vector width
365 #endif
366     }
367 
Transpose_simd16Transpose8_8368     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
369     {
370 #if KNOB_SIMD16_WIDTH == 16
371         // clang-format off
372 
373         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc));      // rrrrrrrrrrrrrrrr
374         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1);  // gggggggggggggggg
375 
376         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
377         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
378 
379         simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
380 
381         simdscalari dst = _simd_or_si(cvt0, shl1);
382 
383         _simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst);  // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
384 
385         // clang-format on
386 #else
387 #error Unsupported vector width
388 #endif
389     }
390 };
391 
392 //////////////////////////////////////////////////////////////////////////
393 /// Transpose32_32_32_32
394 //////////////////////////////////////////////////////////////////////////
395 struct Transpose32_32_32_32
396 {
397     //////////////////////////////////////////////////////////////////////////
398     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
399     /// @param pSrc - source data in SOA form
400     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32_32401     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
402     {
403 #if KNOB_SIMD_WIDTH == 8
404         simdscalar src0 = _simd_load_ps((const float*)pSrc);
405         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
406         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
407         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
408 
409         simd4scalar vDst[8];
410         vTranspose4x8(vDst, src0, src1, src2, src3);
411         SIMD128::store_ps((float*)pDst, vDst[0]);
412         SIMD128::store_ps((float*)pDst + 4, vDst[1]);
413         SIMD128::store_ps((float*)pDst + 8, vDst[2]);
414         SIMD128::store_ps((float*)pDst + 12, vDst[3]);
415         SIMD128::store_ps((float*)pDst + 16, vDst[4]);
416         SIMD128::store_ps((float*)pDst + 20, vDst[5]);
417         SIMD128::store_ps((float*)pDst + 24, vDst[6]);
418         SIMD128::store_ps((float*)pDst + 28, vDst[7]);
419 #else
420 #error Unsupported vector width
421 #endif
422     }
423 
Transpose_simd16Transpose32_32_32_32424     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
425     {
426 #if KNOB_SIMD16_WIDTH == 16
427         // clang-format off
428 
429         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
430         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
431         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
432         simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
433 
434         simd16scalar dst[4];
435 
436         vTranspose4x16(dst, src0, src1, src2, src3);
437 
438         _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst[0]);
439         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
440         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
441         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
442 
443         // clang-format on
444 #else
445 #error Unsupported vector width
446 #endif
447     }
448 };
449 
450 //////////////////////////////////////////////////////////////////////////
451 /// Transpose32_32_32
452 //////////////////////////////////////////////////////////////////////////
453 struct Transpose32_32_32
454 {
455     //////////////////////////////////////////////////////////////////////////
456     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
457     /// @param pSrc - source data in SOA form
458     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32459     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
460     {
461 #if KNOB_SIMD_WIDTH == 8
462         simdscalar src0 = _simd_load_ps((const float*)pSrc);
463         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
464         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
465 
466         simd4scalar vDst[8];
467         vTranspose3x8(vDst, src0, src1, src2);
468         SIMD128::store_ps((float*)pDst, vDst[0]);
469         SIMD128::store_ps((float*)pDst + 4, vDst[1]);
470         SIMD128::store_ps((float*)pDst + 8, vDst[2]);
471         SIMD128::store_ps((float*)pDst + 12, vDst[3]);
472         SIMD128::store_ps((float*)pDst + 16, vDst[4]);
473         SIMD128::store_ps((float*)pDst + 20, vDst[5]);
474         SIMD128::store_ps((float*)pDst + 24, vDst[6]);
475         SIMD128::store_ps((float*)pDst + 28, vDst[7]);
476 #else
477 #error Unsupported vector width
478 #endif
479     }
480 
Transpose_simd16Transpose32_32_32481     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
482     {
483 #if KNOB_SIMD16_WIDTH == 16
484         // clang-format off
485 
486         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
487         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
488         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
489         simd16scalar src3 = _simd16_setzero_ps();
490 
491         simd16scalar dst[4];
492 
493         vTranspose4x16(dst, src0, src1, src2, src3);
494 
495         _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst[0]);
496         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
497         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
498         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
499 
500         // clang-format on
501 #else
502 #error Unsupported vector width
503 #endif
504     }
505 };
506 
507 //////////////////////////////////////////////////////////////////////////
508 /// Transpose32_32
509 //////////////////////////////////////////////////////////////////////////
510 struct Transpose32_32
511 {
512     //////////////////////////////////////////////////////////////////////////
513     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
514     /// @param pSrc - source data in SOA form
515     /// @param pDst - output data in AOS form
TransposeTranspose32_32516     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
517     {
518 #if KNOB_SIMD_WIDTH == 8
519         const float* pfSrc  = (const float*)pSrc;
520         simd4scalar  src_r0 = SIMD128::load_ps(pfSrc + 0);
521         simd4scalar  src_r1 = SIMD128::load_ps(pfSrc + 4);
522         simd4scalar  src_g0 = SIMD128::load_ps(pfSrc + 8);
523         simd4scalar  src_g1 = SIMD128::load_ps(pfSrc + 12);
524 
525         simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
526         simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
527         simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
528         simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
529 
530         float* pfDst = (float*)pDst;
531         SIMD128::store_ps(pfDst + 0, dst0);
532         SIMD128::store_ps(pfDst + 4, dst1);
533         SIMD128::store_ps(pfDst + 8, dst2);
534         SIMD128::store_ps(pfDst + 12, dst3);
535 #else
536 #error Unsupported vector width
537 #endif
538     }
539 
Transpose_simd16Transpose32_32540     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
541     {
542 #if KNOB_SIMD16_WIDTH == 16
543         // clang-format off
544 
545         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));      // rrrrrrrrrrrrrrrr
546         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
547 
548         simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                            // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
549         simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                            // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
550 
551         simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0)  // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
552         simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2)  // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
553 
554         simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0)  // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
555         simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0)  // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
556 
557         _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst0);                    // rgrgrgrgrgrgrgrg
558         _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1);                    // rgrgrgrgrgrgrgrg
559 
560         // clang-format on
561 #else
562 #error Unsupported vector width
563 #endif
564     }
565 };
566 
567 //////////////////////////////////////////////////////////////////////////
568 /// Transpose16_16_16_16
569 //////////////////////////////////////////////////////////////////////////
570 struct Transpose16_16_16_16
571 {
572     //////////////////////////////////////////////////////////////////////////
573     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
574     /// @param pSrc - source data in SOA form
575     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16_16576     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
577     {
578 #if KNOB_SIMD_WIDTH == 8
579         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
580         simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
581 
582         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
583         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
584         simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
585         simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
586 
587         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
588         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
589         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
590         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
591 
592         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
593         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
594         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
595         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
596 
597         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
598         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
599         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
600         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
601 #else
602 #error Unsupported vector width
603 #endif
604     }
605 
Transpose_simd16Transpose16_16_16_16606     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
607     {
608 #if KNOB_SIMD16_WIDTH == 16
609         // clang-format off
610 
611         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
612         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
613         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2);   // bbbbbbbbbbbbbbbb
614         simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3);   // aaaaaaaaaaaaaaaa
615 
616         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
617         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
618         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                    // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
619         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                    // ba4 ba5 ba6 ba7 baC baD baE baF
620 
621         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                    // rbga0 rbga1 rbga8 rbga9
622         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                    // rbga2 rbga3 rbgaA rbgaB
623         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                    // rbga4 rbga5 rgbaC rbgaD
624         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                    // rbga6 rbga7 rbgaE rbgaF
625 
626         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)   // rbga0 rbga1 rbga2 rbga3
627         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)   // rbga4 rbga5 rbga6 rbga7
628         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)   // rbga8 rbga9 rbgaA rbgaB
629         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)   // rbgaC rbgaD rbgaE rbgaF
630 
631         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgbargbargbargba
632         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgbargbargbargba
633         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2);         // rgbargbargbargba
634         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3);         // rgbargbargbargba
635 
636         // clang-format on
637 #else
638 #error Unsupported vector width
639 #endif
640     }
641 };
642 
643 //////////////////////////////////////////////////////////////////////////
644 /// Transpose16_16_16
645 //////////////////////////////////////////////////////////////////////////
646 struct Transpose16_16_16
647 {
648     //////////////////////////////////////////////////////////////////////////
649     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
650     /// @param pSrc - source data in SOA form
651     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16652     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
653     {
654 #if KNOB_SIMD_WIDTH == 8
655         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
656 
657         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
658         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
659         simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
660         simd4scalari src_a = SIMD128::setzero_si();
661 
662         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
663         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
664         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
665         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
666 
667         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
668         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
669         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
670         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
671 
672         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
673         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
674         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
675         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
676 #else
677 #error Unsupported vector width
678 #endif
679     }
680 
Transpose_simd16Transpose16_16_16681     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
682     {
683 #if KNOB_SIMD16_WIDTH == 16
684         // clang-format off
685 
686         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
687         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
688         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2);   // bbbbbbbbbbbbbbbb
689         simdscalari src3 = _simd_setzero_si();                                              // aaaaaaaaaaaaaaaa
690 
691         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
692         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
693         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                    // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
694         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                    // ba4 ba5 ba6 ba7 baC baD baE baF
695 
696         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                    // rbga0 rbga1 rbga8 rbga9
697         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                    // rbga2 rbga3 rbgaA rbgaB
698         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                    // rbga4 rbga5 rgbaC rbgaD
699         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                    // rbga6 rbga7 rbgaE rbgaF
700 
701         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)  // rbga0 rbga1 rbga2 rbga3
702         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)  // rbga4 rbga5 rbga6 rbga7
703         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)  // rbga8 rbga9 rbgaA rbgaB
704         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)  // rbgaC rbgaD rbgaE rbgaF
705 
706         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgbargbargbargba
707         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgbargbargbargba
708         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2);         // rgbargbargbargba
709         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3);         // rgbargbargbargba
710 
711         // clang-format on
712 #else
713 #error Unsupported vector width
714 #endif
715     }
716 };
717 
718 //////////////////////////////////////////////////////////////////////////
719 /// Transpose16_16
720 //////////////////////////////////////////////////////////////////////////
721 struct Transpose16_16
722 {
723     //////////////////////////////////////////////////////////////////////////
724     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
725     /// @param pSrc - source data in SOA form
726     /// @param pDst - output data in AOS form
TransposeTranspose16_16727     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
728     {
729 #if KNOB_SIMD_WIDTH == 8
730         simdscalar src = _simd_load_ps((const float*)pSrc);
731 
732         simd4scalar comp0 = _simd_extractf128_ps(src, 0);
733         simd4scalar comp1 = _simd_extractf128_ps(src, 1);
734 
735         simd4scalari comp0i = SIMD128::castps_si(comp0);
736         simd4scalari comp1i = SIMD128::castps_si(comp1);
737 
738         simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
739         simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
740 
741         SIMD128::store_si((simd4scalari*)pDst, resLo);
742         SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
743 #else
744 #error Unsupported vector width
745 #endif
746     }
747 
Transpose_simd16Transpose16_16748     INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
749     {
750 #if KNOB_SIMD16_WIDTH == 16
751         // clang-format off
752 
753         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
754         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
755 
756         simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
757         simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
758 
759         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
760         simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)   // rg8 rg9 rgA rgB rgC rgD rgE rgF
761 
762         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgrgrgrgrgrgrgrg
763         _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgrgrgrgrgrgrgrg
764 
765         // clang-format on
766 #else
767 #error Unsupported vector width
768 #endif
769     }
770 };
771 
772 //////////////////////////////////////////////////////////////////////////
773 /// Transpose24_8
774 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
784 
785 //////////////////////////////////////////////////////////////////////////
786 /// Transpose32_8_24
787 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
797 
798 //////////////////////////////////////////////////////////////////////////
799 /// Transpose4_4_4_4
800 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
810 
811 //////////////////////////////////////////////////////////////////////////
812 /// Transpose5_6_5
813 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
823 
824 //////////////////////////////////////////////////////////////////////////
825 /// Transpose9_9_9_5
826 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
836 
837 //////////////////////////////////////////////////////////////////////////
838 /// Transpose5_5_5_1
839 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
849 
850 //////////////////////////////////////////////////////////////////////////
851 /// Transpose1_5_5_5
852 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
862 
863 //////////////////////////////////////////////////////////////////////////
864 /// Transpose10_10_10_2
865 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
875 
876 //////////////////////////////////////////////////////////////////////////
877 /// Transpose11_11_10
878 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
888 
889 //////////////////////////////////////////////////////////////////////////
890 /// Transpose64
891 //////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for 64 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
901 
902 //////////////////////////////////////////////////////////////////////////
903 /// Transpose64_64
904 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for 64_64 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
914 
915 //////////////////////////////////////////////////////////////////////////
916 /// Transpose64_64_64
917 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for 64_64_64 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
927 
928 //////////////////////////////////////////////////////////////////////////
929 /// Transpose64_64_64_64
930 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for 64_64_64_64 data.
    ///        Both entry points are declared deleted: no transpose is
    ///        provided for this format, and any call is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
940