1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 { 23 /* 24 Buffer layout after shuffle 25 _________________________________________________ 26 |_______Block0__________|_______Block1__________| 27 |_______Block2__________|_______Block3__________| 28 |_______Block4__________|_______Block5__________| 29 |_______Block6__________|_______Block7__________| 30 31 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 32 _______________________________________________ 33 |____R0_________R1_____|____R2_________R3_____| 34 |____G0_________G1_____|____G2_________G3_____| 35 |____B0_________B1_____|____B2_________B3_____| 36 |____A0_________A1_____|____A2_________A3_____| 37 |____R4_________R5_____|____R6_________R7_____| 38 |____G4_________G5_____|____G6_________G7_____| 39 |____B4_________B5_____|____B6_________B7_____| 40 |____A4_________A5_____|____A6_________A7_____| 41 */ 42 43 matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0); 44 45 SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START); 46 47 matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0); 48 matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0); 49 matrix_ref<ushort, 4, 16> DitherTempRB = DataBuffer4.format<ushort, 16, 16>().select<4, 1, 16, 1>(0, 0); 50 matrix_ref<ushort, 4, 16> DitherTempG = DataBuffer4.format<ushort, 16, 16>().select<4, 1, 16, 1>(4, 0); 51 52 //dithering algorithm 53 //const uint8_t gDitherMatrix_3Bit_4X4[4][4] = { 54 // { 0, 4, 1, 5 }, 55 // { 6, 2, 7, 3 }, 56 // { 1, 5, 0, 4 }, 57 // { 7, 3, 6, 2 } 58 //}; 59 //R2 = (R1 + gDitherMatrix_3Bit_4X4[(x+0) & 3][(y+0) & 3] ) 60 //G2 = (G1 + gDitherMatrix_3Bit_4X4[(x+1) & 3][(y+1) & 3]/2) 61 //B2 = (B1 + gDitherMatrix_3Bit_4X4[(x+2) & 3][(y+2) & 3] ) 62 63 // ----- Dithering Buffer0 ------ 64 #define TempR_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0) 65 #define TempG_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0) 66 #define TempB_Top DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0) 67 #define TempR_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0) 68 #define TempG_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0) 69 #define TempB_Bot DataBuffer0.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0) 70 71 // R 72 // mov(8) uwBUFFER_5(0, 0)<1> 0x51405140 : v //first 16pixel row 73 // mov(8) uwBUFFER_5(0, 8)<1> 0x37263726 : v //second 16pixel row 74 // mov(8) uwBUFFER_5(1, 0)<1> 0x40514051 : v //third 16pixel row 75 // mov(8) uwBUFFER_5(1, 8)<1> 0x26372637 : v //fourth 16pixel row 76 77 { 78 vector<ushort, 8> TempDither(Dither_RB0); 79 DitherTempRB.select<1, 1, 8, 1>(0, 0) = TempDither; 80 } 81 82 { 83 vector<ushort, 8> TempDither(Dither_RB1); 84 DitherTempRB.select<1, 1, 8, 1>(0, 8) = TempDither; 85 } 86 87 { 88 vector<ushort, 8> TempDither(Dither_RB2); 89 DitherTempRB.select<1, 1, 8, 1>(1, 0) = TempDither; 90 } 91 92 { 93 vector<ushort, 8> TempDither(Dither_RB3); 94 DitherTempRB.select<1, 1, 8, 1>(1, 8) = TempDither; 95 } 96 97 DitherTempRB.select<2, 1, 16, 1>(2, 0) = DitherTempRB.select<2, 1, 16, 1>(0, 0); 98 99 DitherTempRB = DitherTempRB << 8; 100 TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 101 TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 102 103 // G 104 // mov(8) uwBUFFER_5(4, 0)<1> 0x63726372 : v //first 16pixel row 105 // mov(8) uwBUFFER_5(4, 8)<1> 0x14051405 : v //second 16pixel row 106 // mov(8) uwBUFFER_5(5, 0)<1> 0x72637263 : v //third 16pixel row 107 // mov(8) uwBUFFER_5(5, 8)<1> 0x05140514 : v //fourth 16pixel row 108 109 { 110 vector<ushort, 8> TempDither(Dither_G0); 111 DitherTempG.select<1, 1, 8, 1>(0, 0) = TempDither; 112 } 113 114 { 115 vector<ushort, 8> TempDither(Dither_G1); 116 DitherTempG.select<1, 1, 8, 1>(0, 8) = TempDither; 117 } 118 119 { 120 vector<ushort, 8> TempDither(Dither_G2); 121 DitherTempG.select<1, 1, 8, 1>(1, 0) = TempDither; 122 } 123 124 { 125 vector<ushort, 8> TempDither(Dither_G3); 126 DitherTempG.select<1, 1, 8, 1>(1, 8) = TempDither; 127 } 128 129 DitherTempG.select<2, 1, 16, 1>(2, 0) = DitherTempG.select<2, 1, 16, 1>(0, 0); 130 131 DitherTempG = DitherTempG << 7; 132 TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT); 133 TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT); 134 135 // B 136 TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 137 TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 138 139 #undef TempR_Top 140 #undef TempG_Top 141 #undef TempB_Top 142 #undef TempR_Bot 143 #undef TempG_Bot 144 #undef TempB_Bot 145 146 // ----- Dithering Buffer1 ------ 147 #define TempR_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0) 148 #define TempG_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0) 149 #define TempB_Top DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0) 150 #define TempR_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0) 151 #define TempG_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0) 152 #define TempB_Bot DataBuffer1.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0) 153 154 // R 155 TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 156 TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 157 158 // G 159 TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT); 160 TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT); 161 162 // B 163 TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 164 TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 165 166 #undef TempR_Top 167 #undef TempG_Top 168 #undef TempB_Top 169 #undef TempR_Bot 170 #undef TempG_Bot 171 #undef TempB_Bot 172 173 // ----- Dithering Buffer2 ------ 174 #define TempR_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0) 175 #define TempG_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0) 176 #define TempB_Top DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0) 177 #define TempR_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0) 178 #define TempG_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0) 179 #define TempB_Bot DataBuffer2.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0) 180 181 // R 182 TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 183 TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 184 185 // G 186 TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT); 187 TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT); 188 189 // B 190 TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 191 TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 192 193 #undef TempR_Top 194 #undef TempG_Top 195 #undef TempB_Top 196 #undef TempR_Bot 197 #undef TempG_Bot 198 #undef TempB_Bot 199 200 // ----- Dithering Buffer3 ------ 201 #define TempR_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0) 202 #define TempG_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(2, 0) 203 #define TempB_Top DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(4, 0) 204 #define TempR_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(8, 0) 205 #define TempG_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(10, 0) 206 #define TempB_Bot DataBuffer3.format<ushort, 16, 16>().select<2, 1, 16, 1>(12, 0) 207 208 // R 209 TempR_Top = cm_add<ushort>(TempR_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 210 TempR_Bot = cm_add<ushort>(TempR_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 211 212 // G 213 TempG_Top = cm_add<ushort>(TempG_Top, DitherTempG.select<2, 1, 16, 1>(0, 0), SAT); 214 TempG_Bot = cm_add<ushort>(TempG_Bot, DitherTempG.select<2, 1, 16, 1>(2, 0), SAT); 215 216 // B 217 TempB_Top = cm_add<ushort>(TempB_Top, DitherTempRB.select<2, 1, 16, 1>(0, 0), SAT); 218 TempB_Bot = cm_add<ushort>(TempB_Bot, DitherTempRB.select<2, 1, 16, 1>(2, 0), SAT); 219 220 #undef TempR_Top 221 #undef TempG_Top 222 #undef TempB_Top 223 #undef TempR_Bot 224 #undef TempG_Bot 225 #undef TempB_Bot 226 227 #pragma unroll 228 for (uchar i = 0; i < 2; i++, DstY += 8) 229 { 230 // First 8x16 231 { 232 // R/G/B/A channels 233 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 0) & 0xF800); 234 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12+ 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 0) & 0xF800); 235 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 8) & 0xF800); 236 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 8) & 0xF800); 237 238 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 0) & 0xF800); 239 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 0) & 0xF800); 240 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 8) & 0xF800); 241 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 8) & 0xF800); 242 243 244 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 0) & 0xF800); 245 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 0) & 0xF800); 246 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 8) & 0xF800); 247 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 8) & 0xF800); 248 249 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 0) & 0xF800); 250 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 0) & 0xF800); 251 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 8) & 0xF800); 252 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 8) & 0xF800); 253 254 write(Dst_Surface, DstX * 2, DstY, Result); 255 } 256 } 257 }