1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
45 
46     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
47 
48     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
49     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
50 #pragma unroll
51     for (uchar i = 0; i < 2; i++, DstY += 8)
52     {
53         // First 8x8
54         {
55             // R/G/B channel 1st half
56             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0);
57             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8);
58             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16);
59             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24);
60 
61             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32);
62             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40);
63             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48);
64             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56);
65 
66             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0);
67             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8);
68             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16);
69             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24);
70 
71             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32);
72             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40);
73             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48);
74             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56);
75 
76             // Rounding
77             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
78             TempR0 = TempR0 & 0xFFC0;
79             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
80             TempR2 = TempR2 & 0xFFC0;
81             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
82             TempR4 = TempR4 & 0xFFC0;
83             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
84             TempR6 = TempR6 & 0xFFC0;
85 
86             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
87             TempG0 = TempG0 & 0xFFC0;
88             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
89             TempG2 = TempG2 & 0xFFC0;
90             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
91             TempG4 = TempG4 & 0xFFC0;
92             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
93             TempG6 = TempG6 & 0xFFC0;
94 
95             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
96             TempB0 = TempB0 & 0xFFC0;
97             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
98             TempB2 = TempB2 & 0xFFC0;
99             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
100             TempB4 = TempB4 & 0xFFC0;
101             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
102             TempB6 = TempB6 & 0xFFC0;
103 
104             TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT);
105             TempA0 = TempA0 & 0xC000;
106             TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT);
107             TempA2 = TempA2 & 0xC000;
108             TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT);
109             TempA4 = TempA4 & 0xC000;
110             TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT);
111             TempA6 = TempA6 & 0xC000;
112 
113             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
114             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
115             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
116             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
117 
118             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
119             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
120             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
121             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
122 
123             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
124             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
125             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
126             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
127 
128             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
129             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
130             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
131             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
132 
133             // R/G/B channel 2nd half
134             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0);
135             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8);
136             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16);
137             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24);
138 
139             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32);
140             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40);
141             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48);
142             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56);
143 
144             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0);
145             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8);
146             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16);
147             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24);
148 
149             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32);
150             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40);
151             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48);
152             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56);
153 
154             // Rounding
155             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
156             TempR8 = TempR8 & 0xFFC0;
157             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
158             TempR10 = TempR10 & 0xFFC0;
159             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
160             TempR12 = TempR12 & 0xFFC0;
161             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
162             TempR14 = TempR14 & 0xFFC0;
163 
164             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
165             TempG8 = TempG8 & 0xFFC0;
166             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
167             TempG10 = TempG10 & 0xFFC0;
168             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
169             TempG12 = TempG12 & 0xFFC0;
170             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
171             TempG14 = TempG14 & 0xFFC0;
172 
173             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
174             TempB8 = TempB8 & 0xFFC0;
175             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
176             TempB10 = TempB10 & 0xFFC0;
177             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
178             TempB12 = TempB12 & 0xFFC0;
179             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
180             TempB14 = TempB14 & 0xFFC0;
181 
182             TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT);
183             TempA8 = TempA8 & 0xC000;
184             TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT);
185             TempA10 = TempA10 & 0xC000;
186             TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT);
187             TempA12 = TempA12 & 0xC000;
188             TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT);
189             TempA14 = TempA14 & 0xC000;
190 
191             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
192             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
193             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
194             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
195 
196             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
197             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
198             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
199             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
200 
201             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
202             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
203             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
204             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
205 
206             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
207             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
208             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
209             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
210 
211             write(Dst_Surface, DstX * 4, DstY, Result);
212         }
213 
214         // Second 8x8
215         {
216             // R/G/B channel 1st half
217             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0);
218             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8);
219             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16);
220             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24);
221 
222             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32);
223             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40);
224             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48);
225             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56);
226 
227             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0);
228             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8);
229             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16);
230             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24);
231 
232             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32);
233             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40);
234             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48);
235             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56);
236 
237             // Rounding
238             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
239             TempR0 = TempR0 & 0xFFC0;
240             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
241             TempR2 = TempR2 & 0xFFC0;
242             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
243             TempR4 = TempR4 & 0xFFC0;
244             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
245             TempR6 = TempR6 & 0xFFC0;
246 
247             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
248             TempG0 = TempG0 & 0xFFC0;
249             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
250             TempG2 = TempG2 & 0xFFC0;
251             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
252             TempG4 = TempG4 & 0xFFC0;
253             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
254             TempG6 = TempG6 & 0xFFC0;
255 
256             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
257             TempB0 = TempB0 & 0xFFC0;
258             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
259             TempB2 = TempB2 & 0xFFC0;
260             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
261             TempB4 = TempB4 & 0xFFC0;
262             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
263             TempB6 = TempB6 & 0xFFC0;
264 
265             TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT);
266             TempA0 = TempA0 & 0xC000;
267             TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT);
268             TempA2 = TempA2 & 0xC000;
269             TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT);
270             TempA4 = TempA4 & 0xC000;
271             TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT);
272             TempA6 = TempA6 & 0xC000;
273 
274             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
275             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
276             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
277             TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
278 
279             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
280             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
281             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
282             TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
283 
284             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
285             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
286             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
287             TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
288 
289             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
290             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
291             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
292             TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
293 
294             // R/G/B channel 2nd half
295             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0);
296             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8);
297             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16);
298             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24);
299 
300             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32);
301             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40);
302             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48);
303             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56);
304 
305             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0);
306             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8);
307             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16);
308             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24);
309 
310             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32);
311             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40);
312             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48);
313             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56);
314 
315             // Rounding
316             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
317             TempR8 = TempR8 & 0xFFC0;
318             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
319             TempR10 = TempR10 & 0xFFC0;
320             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
321             TempR12 = TempR12 & 0xFFC0;
322             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
323             TempR14 = TempR14 & 0xFFC0;
324 
325             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
326             TempG8 = TempG8 & 0xFFC0;
327             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
328             TempG10 = TempG10 & 0xFFC0;
329             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
330             TempG12 = TempG12 & 0xFFC0;
331             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
332             TempG14 = TempG14 & 0xFFC0;
333 
334             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
335             TempB8 = TempB8 & 0xFFC0;
336             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
337             TempB10 = TempB10 & 0xFFC0;
338             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
339             TempB12 = TempB12 & 0xFFC0;
340             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
341             TempB14 = TempB14 & 0xFFC0;
342 
343             TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT);
344             TempA8 = TempA8 & 0xC000;
345             TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT);
346             TempA10 = TempA10 & 0xC000;
347             TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT);
348             TempA12 = TempA12 & 0xC000;
349             TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT);
350             TempA14 = TempA14 & 0xC000;
351 
352             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
353             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
354             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
355             TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
356 
357             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
358             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
359             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
360             TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
361 
362             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
363             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
364             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
365             TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
366 
367             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
368             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
369             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
370             TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
371 
372             write(Dst_Surface, DstX * 4 + 32, DstY, Result);
373         }
374     }
375     //}
376 }