1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     /*
24     Buffer layout after shuffle
25     _________________________________________________
26     |_______Block0__________|_______Block1__________|
27     |_______Block2__________|_______Block3__________|
28     |_______Block4__________|_______Block5__________|
29     |_______Block6__________|_______Block7__________|
30 
31     Write-back buffer layout, keyed by the block numbers above; each box stands for 1 GRF
32     _______________________________________________
33     |____R0_________R1_____|____R2_________R3_____|
34     |____G0_________G1_____|____G2_________G3_____|
35     |____B0_________B1_____|____B2_________B3_____|
36     |____A0_________A1_____|____A2_________A3_____|
37     |____R4_________R5_____|____R6_________R7_____|
38     |____G4_________G5_____|____G6_________G7_____|
39     |____B4_________B5_____|____B6_________B7_____|
40     |____A4_________A5_____|____A6_________A7_____|
41     */
42 
       // Result aliases rows 64..71 of DataBuffer when DataBuffer is viewed as a
       // 96x8 uint matrix. The packing loop at the end of this scope fills it and
       // then writes it to the destination surface.
43     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
44 
       // Destination surface handle, constructed from MDF_FC_OUTPUT_BTI_START
       // (declared outside this file).
45     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
46 
       // Rows 0..3 and rows 4..7 of Result. Both are references, so writing to
       // them writes into Result (and therefore into DataBuffer).
47     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
48     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
       // Scratch storage for the dither offsets, carved out of DataBuffer4 viewed
       // as a 16x16 ushort matrix: rows 0..3 hold the table used for R and B,
       // rows 4..7 hold the table used for G.
49     matrix_ref<ushort, 4, 16> DitherTempRB = DataBuffer4.format<ushort, 16, 16>().select<4, 1, 16, 1>(0, 0);
50     matrix_ref<ushort, 4, 16> DitherTempG  = DataBuffer4.format<ushort, 16, 16>().select<4, 1, 16, 1>(4, 0);
51 
52     //dithering algorithm
53     //const uint8_t gDitherMatrix_3Bit_4X4[4][4] = {
54     //    {  0,  4,  1,  5 },
55     //    {  6,  2,  7,  3 },
56     //    {  1,  5,  0,  4 },
57     //    {  7,  3,  6,  2 }
58     //};
59     //R2 = (R1 + gDitherMatrix_3Bit_4X4[(x+0) & 3][(y+0) & 3]  )
60     //G2 = (G1 + gDitherMatrix_3Bit_4X4[(x+1) & 3][(y+1) & 3]/2)
61     //B2 = (B1 + gDitherMatrix_3Bit_4X4[(x+2) & 3][(y+2) & 3]  )
       //
       // How the code below maps onto these formulas:
       //  - Offsetting both indices of this particular matrix by 2 yields the same
       //    matrix again, which is why B is dithered with the R table
       //    (DitherTempRB) instead of a third table.
       //  - The "/2" on the G term appears as a left shift of 7 rather than 8.
62 
63     // -----  Dithering Buffer0  ------
       // Textual aliases for 2x16 ushort slices of DataBuffer0 (viewed as 16x16
       // ushort): rows 0/2/4 for the "Top" R/G/B slices and rows 8/10/12 for the
       // "Bot" ones. Rows 6..7 and 14..15 are not touched by the dithering
       // (presumably the alpha rows of the layout comment above - TODO confirm).
       // Being macros, each use re-expands to a fresh select on DataBuffer0, so
       // assigning to one writes straight into DataBuffer0.
64 #define TempR_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0)
65 #define TempG_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0)
66 #define TempB_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0)
67 #define TempR_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0)
68 #define TempG_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0)
69 #define TempB_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0)
70 
71     // R
72     // mov(8) uwBUFFER_5(0, 0)<1>  0x51405140 : v //first 16pixel row
73     // mov(8) uwBUFFER_5(0, 8)<1>  0x37263726 : v //second 16pixel row
74     // mov(8) uwBUFFER_5(1, 0)<1>  0x40514051 : v //third 16pixel row
75     // mov(8) uwBUFFER_5(1, 8)<1>  0x26372637 : v //fourth 16pixel row
       // Dither_RB0..Dither_RB3 are defined outside this file; presumably they
       // carry the four packed values listed above, whose nibbles (low to high)
       // spell out matrix rows 0..3 repeated twice - TODO confirm.
       // Each one is expanded to 8 ushort entries and stored into one 8-column
       // half of rows 0..1 of DitherTempRB.
76 
77     {
78         vector<ushort, 8> TempDither(Dither_RB0);
79         DitherTempRB.select<1, 1, 8, 1>(0, 0) = TempDither;
80     }
81 
82     {
83         vector<ushort, 8> TempDither(Dither_RB1);
84         DitherTempRB.select<1, 1, 8, 1>(0, 8) = TempDither;
85     }
86 
87     {
88         vector<ushort, 8> TempDither(Dither_RB2);
89         DitherTempRB.select<1, 1, 8, 1>(1, 0) = TempDither;
90     }
91 
92     {
93         vector<ushort, 8> TempDither(Dither_RB3);
94         DitherTempRB.select<1, 1, 8, 1>(1, 8) = TempDither;
95     }
96 
       // Duplicate rows 0..1 into rows 2..3 so the Top and Bot adds each read
       // their own 2x16 slice holding the same values.
97     DitherTempRB.select<2, 1, 16, 1>(2, 0) = DitherTempRB.select<2, 1, 16, 1>(0, 0);
98 
       // Shift the offsets up by 8 bits, in place. The shifted table stays in
       // DitherTempRB and is reused by every later R and B add in this scope.
99     DitherTempRB = DitherTempRB << 8;
        // Add the offsets to the R slices; SAT requests a saturating add.
100     TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
101     TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
102 
103     // G
104     // mov(8) uwBUFFER_5(4, 0)<1>  0x63726372 : v //first 16pixel row
105     // mov(8) uwBUFFER_5(4, 8)<1>  0x14051405 : v //second 16pixel row
106     // mov(8) uwBUFFER_5(5, 0)<1>  0x72637263 : v //third 16pixel row
107     // mov(8) uwBUFFER_5(5, 8)<1>  0x05140514 : v //fourth 16pixel row
        // Same construction as for R, using Dither_G0..Dither_G3 (defined outside
        // this file; presumably the packed values listed above, i.e. the matrix
        // offset by 1 in both indices - TODO confirm) and filling DitherTempG.
108 
109     {
110         vector<ushort, 8> TempDither(Dither_G0);
111         DitherTempG.select<1, 1, 8, 1>(0, 0) = TempDither;
112     }
113 
114     {
115         vector<ushort, 8> TempDither(Dither_G1);
116         DitherTempG.select<1, 1, 8, 1>(0, 8) = TempDither;
117     }
118 
119     {
120         vector<ushort, 8> TempDither(Dither_G2);
121         DitherTempG.select<1, 1, 8, 1>(1, 0) = TempDither;
122     }
123 
124     {
125         vector<ushort, 8> TempDither(Dither_G3);
126         DitherTempG.select<1, 1, 8, 1>(1, 8) = TempDither;
127     }
128 
        // Duplicate rows 0..1 into rows 2..3 for the Bot add.
129     DitherTempG.select<2, 1, 16, 1>(2, 0) = DitherTempG.select<2, 1, 16, 1>(0, 0);
130 
        // Shift by 7 (one bit less than R/B, i.e. half the magnitude), in place.
        // The shifted table is reused by every later G add in this scope.
131     DitherTempG = DitherTempG << 7;
132     TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT);
133     TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT);
134 
135     // B
        // B reuses the already-shifted R table (see the note under the matrix).
136     TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
137     TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
138 
        // Drop the aliases so they can be redefined for the next buffer.
139 #undef TempR_Top
140 #undef TempG_Top
141 #undef TempB_Top
142 #undef TempR_Bot
143 #undef TempG_Bot
144 #undef TempB_Bot
145 
146     // -----  Dithering Buffer1  ------
        // Same saturating adds as for DataBuffer0, applied to DataBuffer1.
        // DitherTempRB and DitherTempG are used exactly as the Buffer0 code left
        // them (already shifted left by 8 and 7), so no table setup is repeated.
        // The aliases select rows 0/2/4 (Top) and 8/10/12 (Bot) of DataBuffer1
        // viewed as a 16x16 ushort matrix; each slice is 2 rows by 16 columns.
147 #define TempR_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0)
148 #define TempG_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0)
149 #define TempB_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0)
150 #define TempR_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0)
151 #define TempG_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0)
152 #define TempB_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0)
153 
154     // R
155     TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
156     TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
157 
158     // G
159     TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT);
160     TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT);
161 
162     // B
        // B uses the R table: the dither matrix is unchanged by a (2, 2) offset.
163     TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
164     TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
165 
        // Drop the aliases so they can be redefined for the next buffer.
166 #undef TempR_Top
167 #undef TempG_Top
168 #undef TempB_Top
169 #undef TempR_Bot
170 #undef TempG_Bot
171 #undef TempB_Bot
172 
173     // -----  Dithering Buffer2  ------
        // Same saturating adds as for DataBuffer0, applied to DataBuffer2.
        // DitherTempRB and DitherTempG are used exactly as the Buffer0 code left
        // them (already shifted left by 8 and 7), so no table setup is repeated.
        // The aliases select rows 0/2/4 (Top) and 8/10/12 (Bot) of DataBuffer2
        // viewed as a 16x16 ushort matrix; each slice is 2 rows by 16 columns.
174 #define TempR_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0)
175 #define TempG_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0)
176 #define TempB_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0)
177 #define TempR_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0)
178 #define TempG_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0)
179 #define TempB_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0)
180 
181     // R
182     TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
183     TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
184 
185     // G
186     TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT);
187     TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT);
188 
189     // B
        // B uses the R table: the dither matrix is unchanged by a (2, 2) offset.
190     TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
191     TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
192 
        // Drop the aliases so they can be redefined for the next buffer.
193 #undef TempR_Top
194 #undef TempG_Top
195 #undef TempB_Top
196 #undef TempR_Bot
197 #undef TempG_Bot
198 #undef TempB_Bot
199 
200     // -----  Dithering Buffer3  ------
        // Same saturating adds as for DataBuffer0, applied to DataBuffer3.
        // DitherTempRB and DitherTempG are used exactly as the Buffer0 code left
        // them (already shifted left by 8 and 7), so no table setup is repeated.
        // The aliases select rows 0/2/4 (Top) and 8/10/12 (Bot) of DataBuffer3
        // viewed as a 16x16 ushort matrix; each slice is 2 rows by 16 columns.
201 #define TempR_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0)
202 #define TempG_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0)
203 #define TempB_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0)
204 #define TempR_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0)
205 #define TempG_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0)
206 #define TempB_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0)
207 
208     // R
209     TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
210     TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
211 
212     // G
213     TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT);
214     TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT);
215 
216     // B
        // B uses the R table: the dither matrix is unchanged by a (2, 2) offset.
217     TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT);
218     TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT);
219 
        // Drop the aliases; nothing after this point uses them.
220 #undef TempR_Top
221 #undef TempG_Top
222 #undef TempB_Top
223 #undef TempR_Bot
224 #undef TempG_Bot
225 #undef TempB_Bot
226 
227 #pragma unroll
228     for (uchar i = 0; i < 2; i++, DstY += 8)
229     {
            // Two passes. Pass i reads DataBuffer rows starting at 32 * i (in the
            // 96x16 ushort view), packs them into Result and writes Result to the
            // surface. DstY is advanced by 8 after each pass and keeps that
            // advance (16 in total) once the loop is done.
230         // First 8x16
231         {
232             // R/G/B/A channels
                // Every statement below produces 8 packed 16-bit values from three
                // source rows r, r + 2 and r + 4:
                //   bits 15..11 <- row r     (kept in place by & 0xF800)
                //   bits 10..5  <- row r + 2 (top 6 bits: >> 10, then << 5)
                //   bits  4..0  <- row r + 4 (top 5 bits: >> 11)
                // i.e. a 5-6-5 packing; presumably R, G and B in that order, going
                // by the layout comment at the top of this scope - TODO confirm.
                // The three fields occupy disjoint bits, so '+' acts as a bitwise OR.
                //
                // Top rows 0..1 use r = 0 and r = 8 (source columns 0..7, then
                // 8..15); Top rows 2..3 use r = 1 and r = 9. The 12-bit alpha-like
                // rows (r + 6) are never read here.
233             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 0) & 0xF800);
234             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12+ 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 0) & 0xF800);
235             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 8) & 0xF800);
236             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 8) & 0xF800);
237 
238             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 0) & 0xF800);
239             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 0) & 0xF800);
240             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 8) & 0xF800);
241             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 8) & 0xF800);
242 
243 
                // Bottom half of Result: identical pattern with every source row
                // offset by a further 16.
244             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 0) & 0xF800);
245             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 0) & 0xF800);
246             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 8) & 0xF800);
247             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 8) & 0xF800);
248 
249             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 0) & 0xF800);
250             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 0) & 0xF800);
251             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 8) & 0xF800);
252             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 8) & 0xF800);
253 
                // Store the 8x8 uint Result (8 rows of 16 packed 16-bit values) at
                // (DstX * 2, DstY). The "* 2" presumably turns a pixel column into
                // a byte offset for 2-byte pixels - TODO confirm against the caller.
254             write(Dst_Surface, DstX * 2, DstY, Result);
255         }
256     }
257 }