1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 { 24 /* 25 Buffer layout after shuffle 26 _________________________________________________ 27 |_______Block0__________|_______Block1__________| 28 |_______Block2__________|_______Block3__________| 29 |_______Block4__________|_______Block5__________| 30 |_______Block6__________|_______Block7__________| 31 32 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 33 _______________________________________________ 34 |____R0_________R1_____|____R2_________R3_____| 35 |____G0_________G1_____|____G2_________G3_____| 36 |____B0_________B1_____|____B2_________B3_____| 37 |____A0_________A1_____|____A2_________A3_____| 38 |____R4_________R5_____|____R6_________R7_____| 39 |____G4_________G5_____|____G6_________G7_____| 40 |____B4_________B5_____|____B6_________B7_____| 41 |____A4_________A5_____|____A6_________A7_____| 42 */ 43 44 matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0); 45 46 SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START); 47 matrix_ref<ushort, 4, 16> TempResult4x8_Top = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(0, 0); 48 matrix_ref<ushort, 4, 16> TempResult4x8_Bottom = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(4, 0); 49 50 #pragma unroll 51 for (uchar i = 0; i < 2; i++, DstY += 8) 52 { 53 // First 8x8 54 { 55 // R/G/B channel 1st half 56 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0); 57 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8); 58 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16); 59 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24); 60 61 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32); 62 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40); 63 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48); 64 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56); 65 66 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0); 67 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8); 68 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16); 69 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24); 70 71 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32); 72 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40); 73 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48); 74 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56); 75 76 TempB0 = TempB0 & 0xFF00; 77 TempA0 = TempA0 & 0xFF00; 78 79 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0); 80 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA0); 81 82 TempB2 = TempB2 & 0xFF00; 83 TempA2 = TempA2 & 0xFF00; 84 85 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2); 86 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA2); 87 88 TempB4 = TempB4 & 0xFF00; 89 TempA4 = TempA4 & 0xFF00; 90 91 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4); 92 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA4); 93 94 TempB6 = TempB6 & 0xFF00; 95 TempA6 = TempA6 & 0xFF00; 96 97 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6); 98 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA6); 99 100 // R/G/B channel 2nd half 101 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0); 102 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8); 103 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16); 104 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24); 105 106 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32); 107 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40); 108 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48); 109 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56); 110 111 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0); 112 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8); 113 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16); 114 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24); 115 116 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32); 117 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40); 118 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48); 119 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56); 120 121 TempB8 = TempB8 & 0xFF00; 122 TempA8 = TempA8 & 0xFF00; 123 124 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8); 125 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA8); 126 127 TempB10 = TempB10 & 0xFF00; 128 TempA10 = TempA10 & 0xFF00; 129 130 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10); 131 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA10); 132 133 TempB12 = TempB12 & 0xFF00; 134 TempA12 = TempA12 & 0xFF00; 135 136 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12); 137 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA12); 138 139 TempB14 = TempB14 & 0xFF00; 140 TempA14 = TempA14 & 0xFF00; 141 142 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14); 143 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA14); 144 145 write(Dst_Surface, DstX * 4, DstY, Result); 146 } 147 148 // Second 8x8 149 { 150 // R/G/B channel 1st half 151 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0); 152 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8); 153 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16); 154 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24); 155 156 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32); 157 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40); 158 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48); 159 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56); 160 161 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0); 162 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8); 163 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16); 164 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24); 165 166 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32); 167 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40); 168 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48); 169 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56); 170 171 TempB0 = TempB0 & 0xFF00; 172 TempA0 = TempA0 & 0xFF00; 173 174 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0); 175 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA0); 176 177 TempB2 = TempB2 & 0xFF00; 178 TempA2 = TempA2 & 0xFF00; 179 180 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2); 181 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA2); 182 183 TempB4 = TempB4 & 0xFF00; 184 TempA4 = TempA4 & 0xFF00; 185 186 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4); 187 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA4); 188 189 TempB6 = TempB6 & 0xFF00; 190 TempA6 = TempA6 & 0xFF00; 191 192 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6); 193 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA6); 194 195 // R/G/B channel 2nd half 196 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0); 197 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8); 198 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16); 199 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24); 200 201 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32); 202 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40); 203 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48); 204 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56); 205 206 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0); 207 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8); 208 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16); 209 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24); 210 211 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32); 212 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40); 213 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48); 214 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56); 215 216 TempB8 = TempB8 & 0xFF00; 217 TempA8 = TempA8 & 0xFF00; 218 219 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8); 220 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA8); 221 222 TempB10 = TempB10 & 0xFF00; 223 TempA10 = TempA10 & 0xFF00; 224 225 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10); 226 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA10); 227 228 TempB12 = TempB12 & 0xFF00; 229 TempA12 = TempA12 & 0xFF00; 230 231 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12); 232 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA12); 233 234 TempB14 = TempB14 & 0xFF00; 235 TempA14 = TempA14 & 0xFF00; 236 237 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14); 238 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA14); 239 240 write(Dst_Surface, DstX * 4 + 32, DstY, Result); 241 } 242 } 243 }