1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
45 
46     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
47     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
48     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
49 
50     ushort TempA = DestinationRGBFormat << 8;
51 
52 #pragma unroll
53     for (uchar i = 0; i < 2; i++, DstY += 8)
54     {
55         // First 8x8
56         {
57 
58             // first 8x4
59             {
60                 // R/G/B channel top half
61                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0);
62                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8);
63                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16);
64                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24);
65 
66                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32);
67                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40);
68                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48);
69                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56);
70 
71                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0);
72                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8);
73                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16);
74                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24);
75 
76                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
77                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);
78 
79                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
80                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);
81 
82                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
83                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);
84 
85                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
86                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);
87 
88                 // R/G/B channel bottom half
89                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0);
90                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8);
91                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16);
92                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24);
93 
94                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32);
95                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40);
96                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48);
97                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56);
98 
99                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0);
100                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8);
101                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16);
102                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24);
103 
104                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
105                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);
106 
107                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
108                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);
109 
110                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
111                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);
112 
113                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
114                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);
115 
116                 write(Dst_Surface, DstX * 8, DstY, Result);
117             }
118 
119             // second 8x4
120             {
121                 // R/G/B channel top half
122                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0 + 4);
123                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8 + 4);
124                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16 + 4);
125                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24 + 4);
126 
127                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32 + 4);
128                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40 + 4);
129                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48 + 4);
130                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56 + 4);
131 
132                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0 + 4);
133                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8 + 4);
134                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16 + 4);
135                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24 + 4);
136 
137                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
138                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);
139 
140                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
141                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);
142 
143                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
144                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);
145 
146                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
147                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);
148 
149                 // R/G/B channel bottom half
150                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0 + 4);
151                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8 + 4);
152                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16 + 4);
153                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24 + 4);
154 
155                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32 + 4);
156                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40 + 4);
157                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48 + 4);
158                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56 + 4);
159 
160                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0 + 4);
161                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8 + 4);
162                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16 + 4);
163                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24 + 4);
164 
165                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
166                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);
167 
168                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
169                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);
170 
171                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
172                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);
173 
174                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
175                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);
176 
177                 write(Dst_Surface, DstX * 8 + 32, DstY, Result);
178             }
179         }
180 
181         // Second 8x8
182         {
183             // first 8x4
184             {
185                 // R/G/B channel top half
186                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0);
187                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8);
188                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16);
189                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24);
190 
191                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32);
192                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40);
193                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48);
194                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56);
195 
196                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0);
197                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8);
198                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16);
199                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24);
200 
201                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
202                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);
203 
204                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
205                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);
206 
207                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
208                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);
209 
210                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
211                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);
212 
213                 // R/G/B channel bottom half
214                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0);
215                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8);
216                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16);
217                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24);
218 
219                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32);
220                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40);
221                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48);
222                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56);
223 
224                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0);
225                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8);
226                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16);
227                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24);
228 
229                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
230                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);
231 
232                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
233                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);
234 
235                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
236                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);
237 
238                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
239                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);
240 
241                 write(Dst_Surface, DstX * 8 + 64, DstY, Result);
242             }
243 
244             // second 8x4
245             {
246                 // R/G/B channel top half
247                 matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0 + 4);
248                 matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8 + 4);
249                 matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16 + 4);
250                 matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24 + 4);
251 
252                 matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32 + 4);
253                 matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40 + 4);
254                 matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48 + 4);
255                 matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56 + 4);
256 
257                 matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0 + 4);
258                 matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8 + 4);
259                 matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16 + 4);
260                 matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24 + 4);
261 
262                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
263                 TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);
264 
265                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
266                 TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);
267 
268                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
269                 TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);
270 
271                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
272                 TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);
273 
274                 // R/G/B channel bottom half
275                 matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0 + 4);
276                 matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8 + 4);
277                 matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16 + 4);
278                 matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24 + 4);
279 
280                 matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32 + 4);
281                 matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40 + 4);
282                 matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48 + 4);
283                 matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56 + 4);
284 
285                 matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0 + 4);
286                 matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8 + 4);
287                 matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16 + 4);
288                 matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24 + 4);
289 
290                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
291                 TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);
292 
293                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
294                 TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);
295 
296                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
297                 TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);
298 
299                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
300                 TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);
301 
302                 write(Dst_Surface, DstX * 8 + 96, DstY, Result);
303             }
304         }
305     }
306 }