1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 { 24 /* 25 Buffer layout after shuffle 26 _________________________________________________ 27 |_______Block0__________|_______Block1__________| 28 |_______Block2__________|_______Block3__________| 29 |_______Block4__________|_______Block5__________| 30 |_______Block6__________|_______Block7__________| 31 32 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 33 _______________________________________________ 34 |____R0_________R1_____|____R2_________R3_____| 35 |____G0_________G1_____|____G2_________G3_____| 36 |____B0_________B1_____|____B2_________B3_____| 37 |____A0_________A1_____|____A2_________A3_____| 38 |____R4_________R5_____|____R6_________R7_____| 39 |____G4_________G5_____|____G6_________G7_____| 40 |____B4_________B5_____|____B6_________B7_____| 41 |____A4_________A5_____|____A6_________A7_____| 42 */ 43 44 matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0); 45 46 SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START); 47 matrix_ref<ushort, 4, 16> TempResult4x8_Top = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(0, 0); 48 matrix_ref<ushort, 4, 16> TempResult4x8_Bottom = Result.format<ushort, 8, 16>().select<4, 1, 16, 1>(4, 0); 49 50 ushort TempA = DestinationRGBFormat << 8; 51 52 #pragma unroll 53 for (uchar i = 0; i < 2; i++, DstY += 8) 54 { 55 // First 8x8 56 { 57 // R/G/B channel 1st half 58 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0); 59 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8); 60 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16); 61 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24); 62 63 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32); 64 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40); 65 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48); 66 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56); 67 68 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0); 69 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8); 70 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16); 71 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24); 72 73 TempB0 = TempB0 & 0xFF00; 74 75 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0); 76 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA); 77 78 TempB2 = TempB2 & 0xFF00; 79 80 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2); 81 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA); 82 83 TempB4 = TempB4 & 0xFF00; 84 85 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4); 86 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA); 87 88 TempB6 = TempB6 & 0xFF00; 89 90 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6); 91 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA); 92 93 // R/G/B channel 2nd half 94 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0); 95 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8); 96 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16); 97 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24); 98 99 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32); 100 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40); 101 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48); 102 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56); 103 104 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0); 105 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8); 106 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16); 107 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24); 108 109 TempB8 = TempB8 & 0xFF00; 110 111 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8); 112 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA); 113 114 TempB10 = TempB10 & 0xFF00; 115 116 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10); 117 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA); 118 119 TempB12 = TempB12 & 0xFF00; 120 121 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12); 122 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA); 123 124 TempB14 = TempB14 & 0xFF00; 125 126 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14); 127 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA); 128 129 write(Dst_Surface, DstX * 4, DstY, Result); 130 } 131 132 // Second 8x8 133 { 134 // R/G/B channel 1st half 135 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0); 136 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8); 137 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16); 138 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24); 139 140 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32); 141 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40); 142 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48); 143 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56); 144 145 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0); 146 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8); 147 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16); 148 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24); 149 150 TempB0 = TempB0 & 0xFF00; 151 152 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR0 >> 8) + (TempB0); 153 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG0 >> 8) + (TempA); 154 155 TempB2 = TempB2 & 0xFF00; 156 157 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR2 >> 8) + (TempB2); 158 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG2 >> 8) + (TempA); 159 160 TempB4 = TempB4 & 0xFF00; 161 162 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR4 >> 8) + (TempB4); 163 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG4 >> 8) + (TempA); 164 165 TempB6 = TempB6 & 0xFF00; 166 167 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR6 >> 8) + (TempB6); 168 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG6 >> 8) + (TempA); 169 170 // R/G/B channel 2nd half 171 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0); 172 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8); 173 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16); 174 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24); 175 176 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32); 177 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40); 178 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48); 179 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56); 180 181 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0); 182 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8); 183 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16); 184 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24); 185 186 TempB8 = TempB8 & 0xFF00; 187 188 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = (TempR8 >> 8) + (TempB8); 189 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 1) = (TempG8 >> 8) + (TempA); 190 191 TempB10 = TempB10 & 0xFF00; 192 193 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = (TempR10 >> 8) + (TempB10); 194 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 1) = (TempG10 >> 8) + (TempA); 195 196 TempB12 = TempB12 & 0xFF00; 197 198 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = (TempR12 >> 8) + (TempB12); 199 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 1) = (TempG12 >> 8) + (TempA); 200 201 TempB14 = TempB14 & 0xFF00; 202 203 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = (TempR14 >> 8) + (TempB14); 204 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 1) = (TempG14 >> 8) + (TempA); 205 206 write(Dst_Surface, DstX * 4 + 32, DstY, Result); 207 } 208 } 209 }