1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 {
24     /*
25     Buffer layout after shuffle
26     _________________________________________________
27     |_______Block0__________|_______Block1__________|
28     |_______Block2__________|_______Block3__________|
29     |_______Block4__________|_______Block5__________|
30     |_______Block6__________|_______Block7__________|
31 
32     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
33     _______________________________________________
34     |____R0_________R1_____|____R2_________R3_____|
35     |____G0_________G1_____|____G2_________G3_____|
36     |____B0_________B1_____|____B2_________B3_____|
37     |____A0_________A1_____|____A2_________A3_____|
38     |____R4_________R5_____|____R6_________R7_____|
39     |____G4_________G5_____|____G6_________G7_____|
40     |____B4_________B5_____|____B6_________B7_____|
41     |____A4_________A5_____|____A6_________A7_____|
42     */
43 
44     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
45 
46     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
47     matrix_ref<ushort, 4, 16> TempResult4x8_Top = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(0, 0);
48     matrix_ref<ushort, 4, 16> TempResult4x8_Bottom = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(4, 0);
49 
50 #pragma unroll
51     for (uchar i = 0; i < 2; i++, DstY += 8)
52     {
53         // First 8x8
54         {
55             // R/G/B channel 1st half
56             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0);
57             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8);
58             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16);
59             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24);
60 
61             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32);
62             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40);
63             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48);
64             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56);
65 
66             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0);
67             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8);
68             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16);
69             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24);
70 
71             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32);
72             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40);
73             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48);
74             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56);
75 
76             TempB0 = TempB0 & 0xFF00;
77             TempA0 = TempA0 & 0xFF00;
78 
79             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0);
80             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA0);
81 
82             TempB2 = TempB2 & 0xFF00;
83             TempA2 = TempA2 & 0xFF00;
84 
85             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2);
86             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA2);
87 
88             TempB4 = TempB4 & 0xFF00;
89             TempA4 = TempA4 & 0xFF00;
90 
91             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4);
92             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA4);
93 
94             TempB6 = TempB6 & 0xFF00;
95             TempA6 = TempA6 & 0xFF00;
96 
97             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6);
98             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA6);
99 
100             // R/G/B channel 2nd half
101             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0);
102             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8);
103             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16);
104             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24);
105 
106             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32);
107             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40);
108             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48);
109             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56);
110 
111             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0);
112             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8);
113             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16);
114             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24);
115 
116             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32);
117             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40);
118             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48);
119             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56);
120 
121             TempB8 = TempB8 & 0xFF00;
122             TempA8 = TempA8 & 0xFF00;
123 
124             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8);
125             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA8);
126 
127             TempB10 = TempB10 & 0xFF00;
128             TempA10 = TempA10 & 0xFF00;
129 
130             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10);
131             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA10);
132 
133             TempB12 = TempB12 & 0xFF00;
134             TempA12 = TempA12 & 0xFF00;
135 
136             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12);
137             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA12);
138 
139             TempB14 = TempB14 & 0xFF00;
140             TempA14 = TempA14 & 0xFF00;
141 
142             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14);
143             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA14);
144 
145             write(Dst_Surface, DstX * 4, DstY, Result);
146         }
147 
148         // Second 8x8
149         {
150             // R/G/B channel 1st half
151             matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0);
152             matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8);
153             matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16);
154             matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24);
155 
156             matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32);
157             matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40);
158             matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48);
159             matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56);
160 
161             matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0);
162             matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8);
163             matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16);
164             matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24);
165 
166             matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32);
167             matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40);
168             matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48);
169             matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56);
170 
171             TempB0 = TempB0 & 0xFF00;
172             TempA0 = TempA0 & 0xFF00;
173 
174             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0);
175             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA0);
176 
177             TempB2 = TempB2 & 0xFF00;
178             TempA2 = TempA2 & 0xFF00;
179 
180             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2);
181             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA2);
182 
183             TempB4 = TempB4 & 0xFF00;
184             TempA4 = TempA4 & 0xFF00;
185 
186             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4);
187             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA4);
188 
189             TempB6 = TempB6 & 0xFF00;
190             TempA6 = TempA6 & 0xFF00;
191 
192             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6);
193             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA6);
194 
195             // R/G/B channel 2nd half
196             matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0);
197             matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8);
198             matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16);
199             matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24);
200 
201             matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32);
202             matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40);
203             matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48);
204             matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56);
205 
206             matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0);
207             matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8);
208             matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16);
209             matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24);
210 
211             matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32);
212             matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40);
213             matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48);
214             matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56);
215 
216             TempB8 = TempB8 & 0xFF00;
217             TempA8 = TempA8 & 0xFF00;
218 
219             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8);
220             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA8);
221 
222             TempB10 = TempB10 & 0xFF00;
223             TempA10 = TempA10 & 0xFF00;
224 
225             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10);
226             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA10);
227 
228             TempB12 = TempB12 & 0xFF00;
229             TempA12 = TempA12 & 0xFF00;
230 
231             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12);
232             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA12);
233 
234             TempB14 = TempB14 & 0xFF00;
235             TempA14 = TempA14 & 0xFF00;
236 
237             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14);
238             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA14);
239 
240             write(Dst_Surface, DstX * 4 + 32, DstY, Result);
241         }
242     }
243 }