1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 9, 8> Msg = DataBuffer.format<uint, 96, 8>().select<9, 1, 8, 1>(64, 0);
45     matrix_ref<uint, 8, 8> Result = Msg.select<8, 1, 8, 1>(1, 0);
46     uint descriptor;
47 
48     Msg.select<1, 1, 8, 1>(0, 0) = cm_get_r0<uint>();
49     Msg.select<1, 1, 1, 1>(0, 2) = nBLOCK_WIDTH_32 + nBLOCK_HEIGHT_8;
50     descriptor = MDF_FC_OUTPUT_BTI_START + nDPMW_MSGDSC + nMSGLEN_8;
51     uchar ChannelSwap = (WAFlag >> 16) & 0x01;
52 
53     uint TempA = (DestinationRGBFormat >> 6) << 30;
54 
55     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
56     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
57 #pragma unroll
58     for (uchar i = 0; i < 2; i++, DstY += 8)
59     {
60         // First 8x8
61         {
62             // R/G/B channel 1st half
63             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0);
64             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8);
65             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16);
66             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24);
67 
68             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32);
69             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40);
70             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48);
71             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56);
72 
73             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0);
74             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8);
75             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16);
76             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24);
77 
78             // Rounding
79             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
80             TempR0 = TempR0 & 0xFFC0;
81             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
82             TempR2 = TempR2 & 0xFFC0;
83             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
84             TempR4 = TempR4 & 0xFFC0;
85             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
86             TempR6 = TempR6 & 0xFFC0;
87 
88             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
89             TempG0 = TempG0 & 0xFFC0;
90             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
91             TempG2 = TempG2 & 0xFFC0;
92             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
93             TempG4 = TempG4 & 0xFFC0;
94             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
95             TempG6 = TempG6 & 0xFFC0;
96 
97             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
98             TempB0 = TempB0 & 0xFFC0;
99             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
100             TempB2 = TempB2 & 0xFFC0;
101             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
102             TempB4 = TempB4 & 0xFFC0;
103             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
104             TempB6 = TempB6 & 0xFFC0;
105 
106             if (ChannelSwap)
107             {
108                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
109                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
110                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
111                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA;
112 
113                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
114                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
115                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
116                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA;
117 
118                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
119                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
120                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
121                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA;
122 
123                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
124                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
125                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
126                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA;
127             }
128             else
129             {
130                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6;
131                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
132                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000;
133                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA;
134 
135                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6;
136                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
137                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000;
138                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA;
139 
140                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6;
141                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
142                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000;
143                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA;
144 
145                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6;
146                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
147                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000;
148                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA;
149             }
150 
151             // R/G/B channel 2nd half
152             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0);
153             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8);
154             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16);
155             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24);
156 
157             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32);
158             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40);
159             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48);
160             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56);
161 
162             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0);
163             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8);
164             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16);
165             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24);
166 
167             // Rounding
168             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
169             TempR8 = TempR8 & 0xFFC0;
170             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
171             TempR10 = TempR10 & 0xFFC0;
172             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
173             TempR12 = TempR12 & 0xFFC0;
174             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
175             TempR14 = TempR14 & 0xFFC0;
176 
177             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
178             TempG8 = TempG8 & 0xFFC0;
179             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
180             TempG10 = TempG10 & 0xFFC0;
181             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
182             TempG12 = TempG12 & 0xFFC0;
183             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
184             TempG14 = TempG14 & 0xFFC0;
185 
186             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
187             TempB8 = TempB8 & 0xFFC0;
188             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
189             TempB10 = TempB10 & 0xFFC0;
190             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
191             TempB12 = TempB12 & 0xFFC0;
192             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
193             TempB14 = TempB14 & 0xFFC0;
194 
195             if (ChannelSwap)
196             {
197                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
198                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
199                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
200                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA;
201 
202                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
203                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
204                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
205                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA;
206 
207                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
208                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
209                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
210                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA;
211 
212                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
213                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
214                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
215                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA;
216             }
217             else
218             {
219                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6;
220                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
221                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000;
222                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA;
223 
224                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6;
225                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
226                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000;
227                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA;
228 
229                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6;
230                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
231                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000;
232                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA;
233 
234                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6;
235                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
236                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000;
237                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA;
238             }
239 
240             Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4;
241             Msg.select<1, 1, 1, 1>(0, 1) = DstY;
242 
243             cm_send(NULL,
244                 Msg,
245                 nDATAPORT_DC1,
246                 descriptor,
247                 0);
248         }
249 
250         // Second 8x8
251         {
252             // R/G/B channel 1st half
253             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0);
254             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8);
255             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16);
256             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24);
257 
258             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32);
259             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40);
260             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48);
261             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56);
262 
263             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0);
264             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8);
265             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16);
266             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24);
267 
268             // Rounding
269             TempR0 = cm_add<ushort>(TempR0, 0x20, SAT);
270             TempR0 = TempR0 & 0xFFC0;
271             TempR2 = cm_add<ushort>(TempR2, 0x20, SAT);
272             TempR2 = TempR2 & 0xFFC0;
273             TempR4 = cm_add<ushort>(TempR4, 0x20, SAT);
274             TempR4 = TempR4 & 0xFFC0;
275             TempR6 = cm_add<ushort>(TempR6, 0x20, SAT);
276             TempR6 = TempR6 & 0xFFC0;
277 
278             TempG0 = cm_add<ushort>(TempG0, 0x20, SAT);
279             TempG0 = TempG0 & 0xFFC0;
280             TempG2 = cm_add<ushort>(TempG2, 0x20, SAT);
281             TempG2 = TempG2 & 0xFFC0;
282             TempG4 = cm_add<ushort>(TempG4, 0x20, SAT);
283             TempG4 = TempG4 & 0xFFC0;
284             TempG6 = cm_add<ushort>(TempG6, 0x20, SAT);
285             TempG6 = TempG6 & 0xFFC0;
286 
287             TempB0 = cm_add<ushort>(TempB0, 0x20, SAT);
288             TempB0 = TempB0 & 0xFFC0;
289             TempB2 = cm_add<ushort>(TempB2, 0x20, SAT);
290             TempB2 = TempB2 & 0xFFC0;
291             TempB4 = cm_add<ushort>(TempB4, 0x20, SAT);
292             TempB4 = TempB4 & 0xFFC0;
293             TempB6 = cm_add<ushort>(TempB6, 0x20, SAT);
294             TempB6 = TempB6 & 0xFFC0;
295 
296             if (ChannelSwap)
297             {
298                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6;
299                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
300                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000;
301                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA;
302 
303                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6;
304                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
305                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000;
306                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA;
307 
308                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6;
309                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
310                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000;
311                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA;
312 
313                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6;
314                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
315                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000;
316                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA;
317             }
318             else
319             {
320                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6;
321                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10;
322                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000;
323                 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA;
324 
325                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6;
326                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10;
327                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000;
328                 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA;
329 
330                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6;
331                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10;
332                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000;
333                 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA;
334 
335                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6;
336                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10;
337                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000;
338                 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA;
339             }
340 
341             // R/G/B channel 2nd half
342             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0);
343             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8);
344             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16);
345             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24);
346 
347             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32);
348             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40);
349             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48);
350             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56);
351 
352             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0);
353             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8);
354             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16);
355             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24);
356 
357             // Rounding
358             TempR8 = cm_add<ushort>(TempR8, 0x20, SAT);
359             TempR8 = TempR8 & 0xFFC0;
360             TempR10 = cm_add<ushort>(TempR10, 0x20, SAT);
361             TempR10 = TempR10 & 0xFFC0;
362             TempR12 = cm_add<ushort>(TempR12, 0x20, SAT);
363             TempR12 = TempR12 & 0xFFC0;
364             TempR14 = cm_add<ushort>(TempR14, 0x20, SAT);
365             TempR14 = TempR14 & 0xFFC0;
366 
367             TempG8 = cm_add<ushort>(TempG8, 0x20, SAT);
368             TempG8 = TempG8 & 0xFFC0;
369             TempG10 = cm_add<ushort>(TempG10, 0x20, SAT);
370             TempG10 = TempG10 & 0xFFC0;
371             TempG12 = cm_add<ushort>(TempG12, 0x20, SAT);
372             TempG12 = TempG12 & 0xFFC0;
373             TempG14 = cm_add<ushort>(TempG14, 0x20, SAT);
374             TempG14 = TempG14 & 0xFFC0;
375 
376             TempB8 = cm_add<ushort>(TempB8, 0x20, SAT);
377             TempB8 = TempB8 & 0xFFC0;
378             TempB10 = cm_add<ushort>(TempB10, 0x20, SAT);
379             TempB10 = TempB10 & 0xFFC0;
380             TempB12 = cm_add<ushort>(TempB12, 0x20, SAT);
381             TempB12 = TempB12 & 0xFFC0;
382             TempB14 = cm_add<ushort>(TempB14, 0x20, SAT);
383             TempB14 = TempB14 & 0xFFC0;
384 
385             if (ChannelSwap)
386             {
387                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6;
388                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
389                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000;
390                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA;
391 
392                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6;
393                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
394                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000;
395                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA;
396 
397                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6;
398                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
399                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000;
400                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA;
401 
402                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6;
403                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
404                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000;
405                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA;
406             }
407             else
408             {
409                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6;
410                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10;
411                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000;
412                 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA;
413 
414                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6;
415                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10;
416                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000;
417                 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA;
418 
419                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6;
420                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10;
421                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000;
422                 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA;
423 
424                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6;
425                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10;
426                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000;
427                 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA;
428             }
429 
430             Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4 + 32;
431             Msg.select<1, 1, 1, 1>(0, 1) = DstY;
432 
433             cm_send(NULL,
434                 Msg,
435                 nDATAPORT_DC1,
436                 descriptor,
437                 0);
438         }
439     }
440     //}
441 }