1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 9, 8> Msg = DataBuffer.format<uint, 96, 8>().select<9, 1, 8, 1>(64, 0);
45     matrix_ref<uint, 8, 8> Result = Msg.select<8, 1, 8, 1>(1, 0);
46     uint descriptor;
47 
48     Msg.select<1, 1, 8, 1>(0, 0) = cm_get_r0<uint>();
49     Msg.select<1, 1, 1, 1>(0, 2) = nBLOCK_WIDTH_32 + nBLOCK_HEIGHT_8;
50     descriptor = MDF_FC_OUTPUT_BTI_START + nDPMW_MSGDSC + nMSGLEN_8;
51     uchar ChannelSwap = (WAFlag >> 16) & 0x01;
52 
53     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
54     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
55 #pragma unroll
56     for (uchar i = 0; i < 2; i++, DstY += 8)
57     {
58         // First 8x8
59         {
60             // R/G/B channel 1st half
61             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0);
62             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8);
63             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16);
64             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24);
65 
66             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32);
67             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40);
68             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48);
69             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56);
70 
71             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0);
72             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8);
73             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16);
74             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24);
75 
76             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32);
77             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40);
78             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48);
79             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56);
80 
81             // Rounding
82             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
83             TempR0 = TempR0 & 0xFFC0;
84             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
85             TempR2 = TempR2 & 0xFFC0;
86             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
87             TempR4 = TempR4 & 0xFFC0;
88             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
89             TempR6 = TempR6 & 0xFFC0;
90 
91             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
92             TempG0 = TempG0 & 0xFFC0;
93             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
94             TempG2 = TempG2 & 0xFFC0;
95             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
96             TempG4 = TempG4 & 0xFFC0;
97             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
98             TempG6 = TempG6 & 0xFFC0;
99 
100             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
101             TempB0 = TempB0 & 0xFFC0;
102             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
103             TempB2 = TempB2 & 0xFFC0;
104             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
105             TempB4 = TempB4 & 0xFFC0;
106             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
107             TempB6 = TempB6 & 0xFFC0;
108 
109             TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT);
110             TempA0 = TempA0 & 0xC000;
111             TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT);
112             TempA2 = TempA2 & 0xC000;
113             TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT);
114             TempA4 = TempA4 & 0xC000;
115             TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT);
116             TempA6 = TempA6 & 0xC000;
117 
118             if (ChannelSwap)
119             {
120                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
121                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
122                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
123                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
124 
125                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
126                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
127                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
128                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
129 
130                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
131                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
132                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
133                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
134 
135                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
136                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
137                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
138                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
139             }
140             else
141             {
142                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6;
143                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
144                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000;
145                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
146 
147                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6;
148                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
149                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000;
150                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
151 
152                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6;
153                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
154                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000;
155                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
156 
157                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6;
158                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
159                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000;
160                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
161             }
162 
163             // R/G/B channel 2nd half
164             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0);
165             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8);
166             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16);
167             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24);
168 
169             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32);
170             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40);
171             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48);
172             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56);
173 
174             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0);
175             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8);
176             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16);
177             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24);
178 
179             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32);
180             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40);
181             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48);
182             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56);
183 
184             // Rounding
185             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
186             TempR8 = TempR8 & 0xFFC0;
187             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
188             TempR10 = TempR10 & 0xFFC0;
189             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
190             TempR12 = TempR12 & 0xFFC0;
191             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
192             TempR14 = TempR14 & 0xFFC0;
193 
194             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
195             TempG8 = TempG8 & 0xFFC0;
196             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
197             TempG10 = TempG10 & 0xFFC0;
198             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
199             TempG12 = TempG12 & 0xFFC0;
200             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
201             TempG14 = TempG14 & 0xFFC0;
202 
203             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
204             TempB8 = TempB8 & 0xFFC0;
205             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
206             TempB10 = TempB10 & 0xFFC0;
207             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
208             TempB12 = TempB12 & 0xFFC0;
209             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
210             TempB14 = TempB14 & 0xFFC0;
211 
212             TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT);
213             TempA8 = TempA8 & 0xC000;
214             TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT);
215             TempA10 = TempA10 & 0xC000;
216             TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT);
217             TempA12 = TempA12 & 0xC000;
218             TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT);
219             TempA14 = TempA14 & 0xC000;
220 
221             if (ChannelSwap)
222             {
223                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
224                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
225                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
226                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
227 
228                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
229                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
230                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
231                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
232 
233                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
234                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
235                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
236                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
237 
238                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
239                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
240                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
241                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
242             }
243             else
244             {
245                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6;
246                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
247                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000;
248                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
249 
250                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6;
251                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
252                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000;
253                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
254 
255                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6;
256                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
257                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000;
258                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
259 
260                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6;
261                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
262                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000;
263                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
264             }
265 
266             Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4;
267             Msg.select<1, 1, 1, 1>(0, 1) = DstY;
268 
269             cm_send(NULL,
270                 Msg,
271                 nDATAPORT_DC1,
272                 descriptor,
273                 0);
274         }
275 
276         // Second 8x8
277         {
278             // R/G/B channel 1st half
279             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0);
280             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8);
281             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16);
282             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24);
283 
284             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32);
285             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40);
286             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48);
287             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56);
288 
289             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0);
290             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8);
291             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16);
292             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24);
293 
294             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32);
295             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40);
296             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48);
297             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56);
298 
299             // Rounding
300             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
301             TempR0 = TempR0 & 0xFFC0;
302             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
303             TempR2 = TempR2 & 0xFFC0;
304             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
305             TempR4 = TempR4 & 0xFFC0;
306             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
307             TempR6 = TempR6 & 0xFFC0;
308 
309             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
310             TempG0 = TempG0 & 0xFFC0;
311             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
312             TempG2 = TempG2 & 0xFFC0;
313             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
314             TempG4 = TempG4 & 0xFFC0;
315             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
316             TempG6 = TempG6 & 0xFFC0;
317 
318             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
319             TempB0 = TempB0 & 0xFFC0;
320             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
321             TempB2 = TempB2 & 0xFFC0;
322             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
323             TempB4 = TempB4 & 0xFFC0;
324             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
325             TempB6 = TempB6 & 0xFFC0;
326 
327             TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT);
328             TempA0 = TempA0 & 0xC000;
329             TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT);
330             TempA2 = TempA2 & 0xC000;
331             TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT);
332             TempA4 = TempA4 & 0xC000;
333             TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT);
334             TempA6 = TempA6 & 0xC000;
335 
336             if (ChannelSwap)
337             {
338                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
339                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
340                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
341                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
342 
343                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
344                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
345                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
346                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
347 
348                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
349                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
350                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
351                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
352 
353                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
354                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
355                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
356                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
357             }
358             else
359             {
360                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6;
361                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
362                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000;
363                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000;
364 
365                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6;
366                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
367                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000;
368                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000;
369 
370                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6;
371                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
372                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000;
373                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000;
374 
375                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6;
376                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
377                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000;
378                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000;
379             }
380 
381             // R/G/B channel 2nd half
382             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0);
383             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8);
384             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16);
385             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24);
386 
387             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32);
388             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40);
389             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48);
390             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56);
391 
392             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0);
393             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8);
394             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16);
395             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24);
396 
397             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32);
398             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40);
399             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48);
400             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56);
401 
402             // Rounding
403             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
404             TempR8 = TempR8 & 0xFFC0;
405             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
406             TempR10 = TempR10 & 0xFFC0;
407             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
408             TempR12 = TempR12 & 0xFFC0;
409             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
410             TempR14 = TempR14 & 0xFFC0;
411 
412             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
413             TempG8 = TempG8 & 0xFFC0;
414             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
415             TempG10 = TempG10 & 0xFFC0;
416             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
417             TempG12 = TempG12 & 0xFFC0;
418             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
419             TempG14 = TempG14 & 0xFFC0;
420 
421             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
422             TempB8 = TempB8 & 0xFFC0;
423             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
424             TempB10 = TempB10 & 0xFFC0;
425             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
426             TempB12 = TempB12 & 0xFFC0;
427             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
428             TempB14 = TempB14 & 0xFFC0;
429 
430             TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT);
431             TempA8 = TempA8 & 0xC000;
432             TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT);
433             TempA10 = TempA10 & 0xC000;
434             TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT);
435             TempA12 = TempA12 & 0xC000;
436             TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT);
437             TempA14 = TempA14 & 0xC000;
438 
439             if (ChannelSwap)
440             {
441                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
442                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
443                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
444                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
445 
446                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
447                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
448                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
449                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
450 
451                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
452                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
453                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
454                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
455 
456                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
457                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
458                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
459                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
460             }
461             else
462             {
463                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6;
464                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
465                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000;
466                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000;
467 
468                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6;
469                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
470                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000;
471                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000;
472 
473                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6;
474                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
475                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000;
476                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000;
477 
478                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6;
479                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
480                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000;
481                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000;
482             }
483 
484             Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4 + 32;
485             Msg.select<1, 1, 1, 1>(0, 1) = DstY;
486 
487             cm_send(NULL,
488                 Msg,
489                 nDATAPORT_DC1,
490                 descriptor,
491                 0);
492         }
493     }
494     //}
495 }