1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
45 
46     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
47     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
48     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
49 
50 #pragma unroll
51     for (uchar i = 0; i < 2; i++, DstY += 8)
52     {
53         // First 8x8
54         {
55 
56             // first 8x4
57             {
58                 // R/G/B channel top half
59                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0);
60                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8);
61                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16);
62                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24);
63 
64                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32);
65                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40);
66                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48);
67                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56);
68 
69                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0);
70                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8);
71                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16);
72                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24);
73 
74                 matrix_ref<ushort, 1, 4> TempA0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 32);
75                 matrix_ref<ushort, 1, 4> TempA2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 40);
76                 matrix_ref<ushort, 1, 4> TempA4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 48);
77                 matrix_ref<ushort, 1, 4> TempA6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 56);
78 
79                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
80                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA0 << 16);
81 
82                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
83                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA2 << 16);
84 
85                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
86                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA4 << 16);
87 
88                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
89                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA6 << 16);
90 
91                 // R/G/B channel bottom half
92                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0);
93                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8);
94                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16);
95                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24);
96 
97                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32);
98                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40);
99                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48);
100                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56);
101 
102                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0);
103                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8);
104                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16);
105                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24);
106 
107                 matrix_ref<ushort, 1, 4> TempA8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 32);
108                 matrix_ref<ushort, 1, 4> TempA10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 40);
109                 matrix_ref<ushort, 1, 4> TempA12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 48);
110                 matrix_ref<ushort, 1, 4> TempA14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 56);
111 
112                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
113                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA8 << 16);
114 
115                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
116                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA10 << 16);
117 
118                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
119                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA12 << 16);
120 
121                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
122                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA14 << 16);
123 
124                 write(Dst_Surface, DstX * 8, DstY, Result);
125             }
126 
127             // second 8x4
128             {
129                 // R/G/B channel top half
130                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0 + 4);
131                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8 + 4);
132                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16 + 4);
133                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24 + 4);
134 
135                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32 + 4);
136                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40 + 4);
137                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48 + 4);
138                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56 + 4);
139 
140                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0 + 4);
141                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8 + 4);
142                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16 + 4);
143                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24 + 4);
144 
145                 matrix_ref<ushort, 1, 4> TempA0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 32 + 4);
146                 matrix_ref<ushort, 1, 4> TempA2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 40 + 4);
147                 matrix_ref<ushort, 1, 4> TempA4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 48 + 4);
148                 matrix_ref<ushort, 1, 4> TempA6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 56 + 4);
149 
150                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
151                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA0 << 16);
152 
153                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
154                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA2 << 16);
155 
156                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
157                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA4 << 16);
158 
159                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
160                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA6 << 16);
161 
162                 // R/G/B channel bottom half
163                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0 + 4);
164                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8 + 4);
165                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16 + 4);
166                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24 + 4);
167 
168                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32 + 4);
169                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40 + 4);
170                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48 + 4);
171                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56 + 4);
172 
173                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0 + 4);
174                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8 + 4);
175                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16 + 4);
176                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24 + 4);
177 
178                 matrix_ref<ushort, 1, 4> TempA8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 32 + 4);
179                 matrix_ref<ushort, 1, 4> TempA10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 40 + 4);
180                 matrix_ref<ushort, 1, 4> TempA12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 48 + 4);
181                 matrix_ref<ushort, 1, 4> TempA14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 56 + 4);
182 
183                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
184                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA8 << 16);
185 
186                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
187                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA10 << 16);
188 
189                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
190                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA12 << 16);
191 
192                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
193                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA14 << 16);
194 
195                 write(Dst_Surface, DstX * 8 + 32, DstY, Result);
196             }
197         }
198 
199         // Second 8x8
200         {
201             // first 8x4
202             {
203                 // R/G/B channel top half
204                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0);
205                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8);
206                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16);
207                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24);
208 
209                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32);
210                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40);
211                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48);
212                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56);
213 
214                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0);
215                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8);
216                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16);
217                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24);
218 
219                 matrix_ref<ushort, 1, 4> TempA0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 32);
220                 matrix_ref<ushort, 1, 4> TempA2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 40);
221                 matrix_ref<ushort, 1, 4> TempA4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 48);
222                 matrix_ref<ushort, 1, 4> TempA6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 56);
223 
224                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
225                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA0 << 16);
226 
227                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
228                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA2 << 16);
229 
230                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
231                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA4 << 16);
232 
233                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
234                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA6 << 16);
235 
236                 // R/G/B channel bottom half
237                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0);
238                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8);
239                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16);
240                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24);
241 
242                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32);
243                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40);
244                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48);
245                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56);
246 
247                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0);
248                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8);
249                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16);
250                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24);
251 
252                 matrix_ref<ushort, 1, 4> TempA8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 32);
253                 matrix_ref<ushort, 1, 4> TempA10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 40);
254                 matrix_ref<ushort, 1, 4> TempA12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 48);
255                 matrix_ref<ushort, 1, 4> TempA14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 56);
256 
257                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
258                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA8 << 16);
259 
260                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
261                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA10 << 16);
262 
263                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
264                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA12 << 16);
265 
266                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
267                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA14 << 16);
268 
269                 write(Dst_Surface, DstX * 8 + 64, DstY, Result);
270             }
271 
272             // second 8x4
273             {
274                 // R/G/B channel top half
275                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0 + 4);
276                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8 + 4);
277                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16 + 4);
278                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24 + 4);
279 
280                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32 + 4);
281                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40 + 4);
282                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48 + 4);
283                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56 + 4);
284 
285                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0 + 4);
286                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8 + 4);
287                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16 + 4);
288                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24 + 4);
289 
290                 matrix_ref<ushort, 1, 4> TempA0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 32 + 4);
291                 matrix_ref<ushort, 1, 4> TempA2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 40 + 4);
292                 matrix_ref<ushort, 1, 4> TempA4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 48 + 4);
293                 matrix_ref<ushort, 1, 4> TempA6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 56 + 4);
294 
295                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
296                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA0 << 16);
297 
298                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
299                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA2 << 16);
300 
301                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
302                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA4 << 16);
303 
304                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
305                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA6 << 16);
306 
307                 // R/G/B channel bottom half
308                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0 + 4);
309                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8 + 4);
310                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16 + 4);
311                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24 + 4);
312 
313                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32 + 4);
314                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40 + 4);
315                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48 + 4);
316                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56 + 4);
317 
318                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0 + 4);
319                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8 + 4);
320                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16 + 4);
321                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24 + 4);
322 
323                 matrix_ref<ushort, 1, 4> TempA8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 32 + 4);
324                 matrix_ref<ushort, 1, 4> TempA10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 40 + 4);
325                 matrix_ref<ushort, 1, 4> TempA12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 48 + 4);
326                 matrix_ref<ushort, 1, 4> TempA14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 56 + 4);
327 
328                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
329                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA8 << 16);
330 
331                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
332                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA10 << 16);
333 
334                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
335                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA12 << 16);
336 
337                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
338                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA14 << 16);
339 
340                 write(Dst_Surface, DstX * 8 + 96, DstY, Result);
341             }
342         }
343     }
344 }