1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 { 24 /* 25 Buffer layout after shuffle 26 _________________________________________________ 27 |_______Block0__________|_______Block1__________| 28 |_______Block2__________|_______Block3__________| 29 |_______Block4__________|_______Block5__________| 30 |_______Block6__________|_______Block7__________| 31 32 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 33 _______________________________________________ 34 |____R0_________R1_____|____R2_________R3_____| 35 |____G0_________G1_____|____G2_________G3_____| 36 |____B0_________B1_____|____B2_________B3_____| 37 |____A0_________A1_____|____A2_________A3_____| 38 |____R4_________R5_____|____R6_________R7_____| 39 |____G4_________G5_____|____G6_________G7_____| 40 |____B4_________B5_____|____B6_________B7_____| 41 |____A4_________A5_____|____A6_________A7_____| 42 */ 43 44 matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0); 45 46 SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START); 47 48 matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0); 49 matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0); 50 #pragma unroll 51 for (uchar i = 0; i < 2; i++, DstY += 8) 52 { 53 // First 8x8 54 { 55 // R/G/B channel 1st half 56 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0); 57 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8); 58 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16); 59 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24); 60 61 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32); 62 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40); 63 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48); 64 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56); 65 66 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0); 67 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8); 68 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16); 69 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24); 70 71 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32); 72 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40); 73 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48); 74 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56); 75 76 // Rounding 77 TempR0 = cm_add<ushort>(TempR0, 0x20, SAT); 78 TempR0 = TempR0 & 0xFFC0; 79 TempR2 = cm_add<ushort>(TempR2, 0x20, SAT); 80 TempR2 = TempR2 & 0xFFC0; 81 TempR4 = cm_add<ushort>(TempR4, 0x20, SAT); 82 TempR4 = TempR4 & 0xFFC0; 83 TempR6 = cm_add<ushort>(TempR6, 0x20, SAT); 84 TempR6 = TempR6 & 0xFFC0; 85 86 TempG0 = cm_add<ushort>(TempG0, 0x20, SAT); 87 TempG0 = TempG0 & 0xFFC0; 88 TempG2 = cm_add<ushort>(TempG2, 0x20, SAT); 89 TempG2 = TempG2 & 0xFFC0; 90 TempG4 = cm_add<ushort>(TempG4, 0x20, SAT); 91 TempG4 = TempG4 & 0xFFC0; 92 TempG6 = cm_add<ushort>(TempG6, 0x20, SAT); 93 TempG6 = TempG6 & 0xFFC0; 94 95 TempB0 = cm_add<ushort>(TempB0, 0x20, SAT); 96 TempB0 = TempB0 & 0xFFC0; 97 TempB2 = cm_add<ushort>(TempB2, 0x20, SAT); 98 TempB2 = TempB2 & 0xFFC0; 99 TempB4 = cm_add<ushort>(TempB4, 0x20, SAT); 100 TempB4 = TempB4 & 0xFFC0; 101 TempB6 = cm_add<ushort>(TempB6, 0x20, SAT); 102 TempB6 = TempB6 & 0xFFC0; 103 104 TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT); 105 TempA0 = TempA0 & 0xC000; 106 TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT); 107 TempA2 = TempA2 & 0xC000; 108 TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT); 109 TempA4 = TempA4 & 0xC000; 110 TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT); 111 TempA6 = TempA6 & 0xC000; 112 113 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6; 114 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 115 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000; 116 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 117 118 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6; 119 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 120 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000; 121 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 122 123 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6; 124 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 125 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000; 126 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 127 128 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6; 129 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 130 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000; 131 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 132 133 // R/G/B channel 2nd half 134 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0); 135 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8); 136 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16); 137 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24); 138 139 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32); 140 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40); 141 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48); 142 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56); 143 144 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0); 145 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8); 146 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16); 147 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24); 148 149 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32); 150 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40); 151 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48); 152 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56); 153 154 // Rounding 155 TempR8 = cm_add<ushort>(TempR8, 0x20, SAT); 156 TempR8 = TempR8 & 0xFFC0; 157 TempR10 = cm_add<ushort>(TempR10, 0x20, SAT); 158 TempR10 = TempR10 & 0xFFC0; 159 TempR12 = cm_add<ushort>(TempR12, 0x20, SAT); 160 TempR12 = TempR12 & 0xFFC0; 161 TempR14 = cm_add<ushort>(TempR14, 0x20, SAT); 162 TempR14 = TempR14 & 0xFFC0; 163 164 TempG8 = cm_add<ushort>(TempG8, 0x20, SAT); 165 TempG8 = TempG8 & 0xFFC0; 166 TempG10 = cm_add<ushort>(TempG10, 0x20, SAT); 167 TempG10 = TempG10 & 0xFFC0; 168 TempG12 = cm_add<ushort>(TempG12, 0x20, SAT); 169 TempG12 = TempG12 & 0xFFC0; 170 TempG14 = cm_add<ushort>(TempG14, 0x20, SAT); 171 TempG14 = TempG14 & 0xFFC0; 172 173 TempB8 = cm_add<ushort>(TempB8, 0x20, SAT); 174 TempB8 = TempB8 & 0xFFC0; 175 TempB10 = cm_add<ushort>(TempB10, 0x20, SAT); 176 TempB10 = TempB10 & 0xFFC0; 177 TempB12 = cm_add<ushort>(TempB12, 0x20, SAT); 178 TempB12 = TempB12 & 0xFFC0; 179 TempB14 = cm_add<ushort>(TempB14, 0x20, SAT); 180 TempB14 = TempB14 & 0xFFC0; 181 182 TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT); 183 TempA8 = TempA8 & 0xC000; 184 TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT); 185 TempA10 = TempA10 & 0xC000; 186 TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT); 187 TempA12 = TempA12 & 0xC000; 188 TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT); 189 TempA14 = TempA14 & 0xC000; 190 191 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6; 192 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 193 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000; 194 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 195 196 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6; 197 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 198 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000; 199 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 200 201 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6; 202 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 203 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000; 204 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 205 206 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6; 207 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 208 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000; 209 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 210 211 write(Dst_Surface, DstX * 4, DstY, Result); 212 } 213 214 // Second 8x8 215 { 216 // R/G/B channel 1st half 217 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0); 218 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8); 219 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16); 220 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24); 221 222 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32); 223 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40); 224 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48); 225 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56); 226 227 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0); 228 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8); 229 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16); 230 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24); 231 232 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32); 233 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40); 234 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48); 235 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56); 236 237 // Rounding 238 TempR0 = cm_add<ushort>(TempR0, 0x20, SAT); 239 TempR0 = TempR0 & 0xFFC0; 240 TempR2 = cm_add<ushort>(TempR2, 0x20, SAT); 241 TempR2 = TempR2 & 0xFFC0; 242 TempR4 = cm_add<ushort>(TempR4, 0x20, SAT); 243 TempR4 = TempR4 & 0xFFC0; 244 TempR6 = cm_add<ushort>(TempR6, 0x20, SAT); 245 TempR6 = TempR6 & 0xFFC0; 246 247 TempG0 = cm_add<ushort>(TempG0, 0x20, SAT); 248 TempG0 = TempG0 & 0xFFC0; 249 TempG2 = cm_add<ushort>(TempG2, 0x20, SAT); 250 TempG2 = TempG2 & 0xFFC0; 251 TempG4 = cm_add<ushort>(TempG4, 0x20, SAT); 252 TempG4 = TempG4 & 0xFFC0; 253 TempG6 = cm_add<ushort>(TempG6, 0x20, SAT); 254 TempG6 = TempG6 & 0xFFC0; 255 256 TempB0 = cm_add<ushort>(TempB0, 0x20, SAT); 257 TempB0 = TempB0 & 0xFFC0; 258 TempB2 = cm_add<ushort>(TempB2, 0x20, SAT); 259 TempB2 = TempB2 & 0xFFC0; 260 TempB4 = cm_add<ushort>(TempB4, 0x20, SAT); 261 TempB4 = TempB4 & 0xFFC0; 262 TempB6 = cm_add<ushort>(TempB6, 0x20, SAT); 263 TempB6 = TempB6 & 0xFFC0; 264 265 TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT); 266 TempA0 = TempA0 & 0xC000; 267 TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT); 268 TempA2 = TempA2 & 0xC000; 269 TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT); 270 TempA4 = TempA4 & 0xC000; 271 TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT); 272 TempA6 = TempA6 & 0xC000; 273 274 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6; 275 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 276 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000; 277 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 278 279 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6; 280 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 281 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000; 282 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 283 284 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6; 285 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 286 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000; 287 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 288 289 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6; 290 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 291 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000; 292 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 293 294 // R/G/B channel 2nd half 295 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0); 296 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8); 297 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16); 298 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24); 299 300 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32); 301 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40); 302 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48); 303 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56); 304 305 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0); 306 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8); 307 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16); 308 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24); 309 310 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32); 311 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40); 312 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48); 313 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56); 314 315 // Rounding 316 TempR8 = cm_add<ushort>(TempR8, 0x20, SAT); 317 TempR8 = TempR8 & 0xFFC0; 318 TempR10 = cm_add<ushort>(TempR10, 0x20, SAT); 319 TempR10 = TempR10 & 0xFFC0; 320 TempR12 = cm_add<ushort>(TempR12, 0x20, SAT); 321 TempR12 = TempR12 & 0xFFC0; 322 TempR14 = cm_add<ushort>(TempR14, 0x20, SAT); 323 TempR14 = TempR14 & 0xFFC0; 324 325 TempG8 = cm_add<ushort>(TempG8, 0x20, SAT); 326 TempG8 = TempG8 & 0xFFC0; 327 TempG10 = cm_add<ushort>(TempG10, 0x20, SAT); 328 TempG10 = TempG10 & 0xFFC0; 329 TempG12 = cm_add<ushort>(TempG12, 0x20, SAT); 330 TempG12 = TempG12 & 0xFFC0; 331 TempG14 = cm_add<ushort>(TempG14, 0x20, SAT); 332 TempG14 = TempG14 & 0xFFC0; 333 334 TempB8 = cm_add<ushort>(TempB8, 0x20, SAT); 335 TempB8 = TempB8 & 0xFFC0; 336 TempB10 = cm_add<ushort>(TempB10, 0x20, SAT); 337 TempB10 = TempB10 & 0xFFC0; 338 TempB12 = cm_add<ushort>(TempB12, 0x20, SAT); 339 TempB12 = TempB12 & 0xFFC0; 340 TempB14 = cm_add<ushort>(TempB14, 0x20, SAT); 341 TempB14 = TempB14 & 0xFFC0; 342 343 TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT); 344 TempA8 = TempA8 & 0xC000; 345 TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT); 346 TempA10 = TempA10 & 0xC000; 347 TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT); 348 TempA12 = TempA12 & 0xC000; 349 TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT); 350 TempA14 = TempA14 & 0xC000; 351 352 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6; 353 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 354 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000; 355 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 356 357 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6; 358 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 359 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000; 360 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 361 362 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6; 363 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 364 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000; 365 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 366 367 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6; 368 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 369 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000; 370 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 371 372 write(Dst_Surface, DstX * 4 + 32, DstY, Result); 373 } 374 } 375 //} 376 }