1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 { 24 /* 25 Buffer layout after shuffle 26 _________________________________________________ 27 |_______Block0__________|_______Block1__________| 28 |_______Block2__________|_______Block3__________| 29 |_______Block4__________|_______Block5__________| 30 |_______Block6__________|_______Block7__________| 31 32 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 33 _______________________________________________ 34 |____R0_________R1_____|____R2_________R3_____| 35 |____G0_________G1_____|____G2_________G3_____| 36 |____B0_________B1_____|____B2_________B3_____| 37 |____A0_________A1_____|____A2_________A3_____| 38 |____R4_________R5_____|____R6_________R7_____| 39 |____G4_________G5_____|____G6_________G7_____| 40 |____B4_________B5_____|____B6_________B7_____| 41 |____A4_________A5_____|____A6_________A7_____| 42 */ 43 44 matrix_ref<uint, 9, 8> Msg = DataBuffer.format<uint, 96, 8>().select<9, 1, 8, 1>(64, 0); 45 matrix_ref<uint, 8, 8> Result = Msg.select<8, 1, 8, 1>(1, 0); 46 uint descriptor; 47 48 Msg.select<1, 1, 8, 1>(0, 0) = cm_get_r0<uint>(); 49 Msg.select<1, 1, 1, 1>(0, 2) = nBLOCK_WIDTH_32 + nBLOCK_HEIGHT_8; 50 descriptor = MDF_FC_OUTPUT_BTI_START + nDPMW_MSGDSC + nMSGLEN_8; 51 uchar ChannelSwap = (WAFlag >> 16) & 0x01; 52 53 matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0); 54 matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0); 55 #pragma unroll 56 for (uchar i = 0; i < 2; i++, DstY += 8) 57 { 58 // First 8x8 59 { 60 // R/G/B channel 1st half 61 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 0); 62 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 8); 63 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 16); 64 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 24); 65 66 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i, 32); 67 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i, 40); 68 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i, 48); 69 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i, 56); 70 71 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 0); 72 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 8); 73 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 16); 74 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 24); 75 76 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 32); 77 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 40); 78 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 48); 79 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 1, 56); 80 81 // Rounding 82 TempR0 = cm_add<ushort>(TempR0, 0x20, SAT); 83 TempR0 = TempR0 & 0xFFC0; 84 TempR2 = cm_add<ushort>(TempR2, 0x20, SAT); 85 TempR2 = TempR2 & 0xFFC0; 86 TempR4 = cm_add<ushort>(TempR4, 0x20, SAT); 87 TempR4 = TempR4 & 0xFFC0; 88 TempR6 = cm_add<ushort>(TempR6, 0x20, SAT); 89 TempR6 = TempR6 & 0xFFC0; 90 91 TempG0 = cm_add<ushort>(TempG0, 0x20, SAT); 92 TempG0 = TempG0 & 0xFFC0; 93 TempG2 = cm_add<ushort>(TempG2, 0x20, SAT); 94 TempG2 = TempG2 & 0xFFC0; 95 TempG4 = cm_add<ushort>(TempG4, 0x20, SAT); 96 TempG4 = TempG4 & 0xFFC0; 97 TempG6 = cm_add<ushort>(TempG6, 0x20, SAT); 98 TempG6 = TempG6 & 0xFFC0; 99 100 TempB0 = cm_add<ushort>(TempB0, 0x20, SAT); 101 TempB0 = TempB0 & 0xFFC0; 102 TempB2 = cm_add<ushort>(TempB2, 0x20, SAT); 103 TempB2 = TempB2 & 0xFFC0; 104 TempB4 = cm_add<ushort>(TempB4, 0x20, SAT); 105 TempB4 = TempB4 & 0xFFC0; 106 TempB6 = cm_add<ushort>(TempB6, 0x20, SAT); 107 TempB6 = TempB6 & 0xFFC0; 108 109 TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT); 110 TempA0 = TempA0 & 0xC000; 111 TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT); 112 TempA2 = TempA2 & 0xC000; 113 TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT); 114 TempA4 = TempA4 & 0xC000; 115 TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT); 116 TempA6 = TempA6 & 0xC000; 117 118 if (ChannelSwap) 119 { 120 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6; 121 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 122 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000; 123 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 124 125 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6; 126 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 127 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000; 128 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 129 130 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6; 131 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 132 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000; 133 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 134 135 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6; 136 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 137 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000; 138 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 139 } 140 else 141 { 142 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6; 143 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 144 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000; 145 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 146 147 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6; 148 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 149 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000; 150 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 151 152 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6; 153 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 154 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000; 155 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 156 157 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6; 158 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 159 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000; 160 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 161 } 162 163 // R/G/B channel 2nd half 164 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 0); 165 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 8); 166 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 16); 167 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 24); 168 169 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 32); 170 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 40); 171 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 48); 172 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 4, 56); 173 174 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 0); 175 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 8); 176 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 16); 177 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 24); 178 179 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 32); 180 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 40); 181 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 48); 182 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 5, 56); 183 184 // Rounding 185 TempR8 = cm_add<ushort>(TempR8, 0x20, SAT); 186 TempR8 = TempR8 & 0xFFC0; 187 TempR10 = cm_add<ushort>(TempR10, 0x20, SAT); 188 TempR10 = TempR10 & 0xFFC0; 189 TempR12 = cm_add<ushort>(TempR12, 0x20, SAT); 190 TempR12 = TempR12 & 0xFFC0; 191 TempR14 = cm_add<ushort>(TempR14, 0x20, SAT); 192 TempR14 = TempR14 & 0xFFC0; 193 194 TempG8 = cm_add<ushort>(TempG8, 0x20, SAT); 195 TempG8 = TempG8 & 0xFFC0; 196 TempG10 = cm_add<ushort>(TempG10, 0x20, SAT); 197 TempG10 = TempG10 & 0xFFC0; 198 TempG12 = cm_add<ushort>(TempG12, 0x20, SAT); 199 TempG12 = TempG12 & 0xFFC0; 200 TempG14 = cm_add<ushort>(TempG14, 0x20, SAT); 201 TempG14 = TempG14 & 0xFFC0; 202 203 TempB8 = cm_add<ushort>(TempB8, 0x20, SAT); 204 TempB8 = TempB8 & 0xFFC0; 205 TempB10 = cm_add<ushort>(TempB10, 0x20, SAT); 206 TempB10 = TempB10 & 0xFFC0; 207 TempB12 = cm_add<ushort>(TempB12, 0x20, SAT); 208 TempB12 = TempB12 & 0xFFC0; 209 TempB14 = cm_add<ushort>(TempB14, 0x20, SAT); 210 TempB14 = TempB14 & 0xFFC0; 211 212 TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT); 213 TempA8 = TempA8 & 0xC000; 214 TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT); 215 TempA10 = TempA10 & 0xC000; 216 TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT); 217 TempA12 = TempA12 & 0xC000; 218 TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT); 219 TempA14 = TempA14 & 0xC000; 220 221 if (ChannelSwap) 222 { 223 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6; 224 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 225 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000; 226 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 227 228 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6; 229 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 230 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000; 231 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 232 233 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6; 234 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 235 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000; 236 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 237 238 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6; 239 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 240 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000; 241 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 242 } 243 else 244 { 245 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6; 246 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 247 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000; 248 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 249 250 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6; 251 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 252 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000; 253 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 254 255 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6; 256 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 257 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000; 258 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 259 260 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6; 261 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 262 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000; 263 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 264 } 265 266 Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4; 267 Msg.select<1, 1, 1, 1>(0, 1) = DstY; 268 269 cm_send(NULL, 270 Msg, 271 nDATAPORT_DC1, 272 descriptor, 273 0); 274 } 275 276 // Second 8x8 277 { 278 // R/G/B channel 1st half 279 matrix_ref<ushort, 1, 8> TempR0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 0); 280 matrix_ref<ushort, 1, 8> TempR2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 8); 281 matrix_ref<ushort, 1, 8> TempR4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 16); 282 matrix_ref<ushort, 1, 8> TempR6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 24); 283 284 matrix_ref<ushort, 1, 8> TempG0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 32); 285 matrix_ref<ushort, 1, 8> TempG2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 40); 286 matrix_ref<ushort, 1, 8> TempG4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 48); 287 matrix_ref<ushort, 1, 8> TempG6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 2, 56); 288 289 matrix_ref<ushort, 1, 8> TempB0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 0); 290 matrix_ref<ushort, 1, 8> TempB2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 8); 291 matrix_ref<ushort, 1, 8> TempB4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 16); 292 matrix_ref<ushort, 1, 8> TempB6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 24); 293 294 matrix_ref<ushort, 1, 8> TempA0 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 32); 295 matrix_ref<ushort, 1, 8> TempA2 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 40); 296 matrix_ref<ushort, 1, 8> TempA4 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 48); 297 matrix_ref<ushort, 1, 8> TempA6 = DataBuffer.select<1, 1, 8, 1>(8 * i + 3, 56); 298 299 // Rounding 300 TempR0 = cm_add<ushort>(TempR0, 0x20, SAT); 301 TempR0 = TempR0 & 0xFFC0; 302 TempR2 = cm_add<ushort>(TempR2, 0x20, SAT); 303 TempR2 = TempR2 & 0xFFC0; 304 TempR4 = cm_add<ushort>(TempR4, 0x20, SAT); 305 TempR4 = TempR4 & 0xFFC0; 306 TempR6 = cm_add<ushort>(TempR6, 0x20, SAT); 307 TempR6 = TempR6 & 0xFFC0; 308 309 TempG0 = cm_add<ushort>(TempG0, 0x20, SAT); 310 TempG0 = TempG0 & 0xFFC0; 311 TempG2 = cm_add<ushort>(TempG2, 0x20, SAT); 312 TempG2 = TempG2 & 0xFFC0; 313 TempG4 = cm_add<ushort>(TempG4, 0x20, SAT); 314 TempG4 = TempG4 & 0xFFC0; 315 TempG6 = cm_add<ushort>(TempG6, 0x20, SAT); 316 TempG6 = TempG6 & 0xFFC0; 317 318 TempB0 = cm_add<ushort>(TempB0, 0x20, SAT); 319 TempB0 = TempB0 & 0xFFC0; 320 TempB2 = cm_add<ushort>(TempB2, 0x20, SAT); 321 TempB2 = TempB2 & 0xFFC0; 322 TempB4 = cm_add<ushort>(TempB4, 0x20, SAT); 323 TempB4 = TempB4 & 0xFFC0; 324 TempB6 = cm_add<ushort>(TempB6, 0x20, SAT); 325 TempB6 = TempB6 & 0xFFC0; 326 327 TempA0 = cm_add<ushort>(TempA0, 0x2000, SAT); 328 TempA0 = TempA0 & 0xC000; 329 TempA2 = cm_add<ushort>(TempA2, 0x2000, SAT); 330 TempA2 = TempA2 & 0xC000; 331 TempA4 = cm_add<ushort>(TempA4, 0x2000, SAT); 332 TempA4 = TempA4 & 0xC000; 333 TempA6 = cm_add<ushort>(TempA6, 0x2000, SAT); 334 TempA6 = TempA6 & 0xC000; 335 336 if (ChannelSwap) 337 { 338 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempB0 >> 6; 339 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 340 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempR0 * 0x4000; 341 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 342 343 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempB2 >> 6; 344 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 345 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempR2 * 0x4000; 346 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 347 348 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempB4 >> 6; 349 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 350 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempR4 * 0x4000; 351 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 352 353 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempB6 >> 6; 354 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 355 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempR6 * 0x4000; 356 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 357 } 358 else 359 { 360 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempR0 >> 6; 361 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempG0 * 0x10; 362 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempB0 * 0x4000; 363 TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(0, 0) + TempA0 * 0x10000; 364 365 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempR2 >> 6; 366 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempG2 * 0x10; 367 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempB2 * 0x4000; 368 TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(1, 0) + TempA2 * 0x10000; 369 370 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempR4 >> 6; 371 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempG4 * 0x10; 372 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempB4 * 0x4000; 373 TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(2, 0) + TempA4 * 0x10000; 374 375 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempR6 >> 6; 376 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempG6 * 0x10; 377 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempB6 * 0x4000; 378 TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Top.select<1, 1, 8, 1>(3, 0) + TempA6 * 0x10000; 379 } 380 381 // R/G/B channel 2nd half 382 matrix_ref<ushort, 1, 8> TempR8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 0); 383 matrix_ref<ushort, 1, 8> TempR10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 8); 384 matrix_ref<ushort, 1, 8> TempR12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 16); 385 matrix_ref<ushort, 1, 8> TempR14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 24); 386 387 matrix_ref<ushort, 1, 8> TempG8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 32); 388 matrix_ref<ushort, 1, 8> TempG10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 40); 389 matrix_ref<ushort, 1, 8> TempG12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 48); 390 matrix_ref<ushort, 1, 8> TempG14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 6, 56); 391 392 matrix_ref<ushort, 1, 8> TempB8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 0); 393 matrix_ref<ushort, 1, 8> TempB10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 8); 394 matrix_ref<ushort, 1, 8> TempB12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 16); 395 matrix_ref<ushort, 1, 8> TempB14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 24); 396 397 matrix_ref<ushort, 1, 8> TempA8 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 32); 398 matrix_ref<ushort, 1, 8> TempA10 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 40); 399 matrix_ref<ushort, 1, 8> TempA12 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 48); 400 matrix_ref<ushort, 1, 8> TempA14 = DataBuffer.select<1, 1, 8, 1>(8 * i + 7, 56); 401 402 // Rounding 403 TempR8 = cm_add<ushort>(TempR8, 0x20, SAT); 404 TempR8 = TempR8 & 0xFFC0; 405 TempR10 = cm_add<ushort>(TempR10, 0x20, SAT); 406 TempR10 = TempR10 & 0xFFC0; 407 TempR12 = cm_add<ushort>(TempR12, 0x20, SAT); 408 TempR12 = TempR12 & 0xFFC0; 409 TempR14 = cm_add<ushort>(TempR14, 0x20, SAT); 410 TempR14 = TempR14 & 0xFFC0; 411 412 TempG8 = cm_add<ushort>(TempG8, 0x20, SAT); 413 TempG8 = TempG8 & 0xFFC0; 414 TempG10 = cm_add<ushort>(TempG10, 0x20, SAT); 415 TempG10 = TempG10 & 0xFFC0; 416 TempG12 = cm_add<ushort>(TempG12, 0x20, SAT); 417 TempG12 = TempG12 & 0xFFC0; 418 TempG14 = cm_add<ushort>(TempG14, 0x20, SAT); 419 TempG14 = TempG14 & 0xFFC0; 420 421 TempB8 = cm_add<ushort>(TempB8, 0x20, SAT); 422 TempB8 = TempB8 & 0xFFC0; 423 TempB10 = cm_add<ushort>(TempB10, 0x20, SAT); 424 TempB10 = TempB10 & 0xFFC0; 425 TempB12 = cm_add<ushort>(TempB12, 0x20, SAT); 426 TempB12 = TempB12 & 0xFFC0; 427 TempB14 = cm_add<ushort>(TempB14, 0x20, SAT); 428 TempB14 = TempB14 & 0xFFC0; 429 430 TempA8 = cm_add<ushort>(TempA8, 0x2000, SAT); 431 TempA8 = TempA8 & 0xC000; 432 TempA10 = cm_add<ushort>(TempA10, 0x2000, SAT); 433 TempA10 = TempA10 & 0xC000; 434 TempA12 = cm_add<ushort>(TempA12, 0x2000, SAT); 435 TempA12 = TempA12 & 0xC000; 436 TempA14 = cm_add<ushort>(TempA14, 0x2000, SAT); 437 TempA14 = TempA14 & 0xC000; 438 439 if (ChannelSwap) 440 { 441 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempB8 >> 6; 442 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 443 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempR8 * 0x4000; 444 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 445 446 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempB10 >> 6; 447 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 448 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempR10 * 0x4000; 449 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 450 451 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempB12 >> 6; 452 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 453 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempR12 * 0x4000; 454 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 455 456 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempB14 >> 6; 457 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 458 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempR14 * 0x4000; 459 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 460 } 461 else 462 { 463 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempR8 >> 6; 464 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempG8 * 0x10; 465 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempB8 * 0x4000; 466 TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(0, 0) + TempA8 * 0x10000; 467 468 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempR10 >> 6; 469 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempG10 * 0x10; 470 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempB10 * 0x4000; 471 TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(1, 0) + TempA10 * 0x10000; 472 473 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempR12 >> 6; 474 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempG12 * 0x10; 475 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempB12 * 0x4000; 476 TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(2, 0) + TempA12 * 0x10000; 477 478 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempR14 >> 6; 479 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempG14 * 0x10; 480 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempB14 * 0x4000; 481 TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) = TempResult4x8_Bottom.select<1, 1, 8, 1>(3, 0) + TempA14 * 0x10000; 482 } 483 484 Msg.select<1, 1, 1, 1>(0, 0) = DstX * 4 + 32; 485 Msg.select<1, 1, 1, 1>(0, 1) = DstY; 486 487 cm_send(NULL, 488 Msg, 489 nDATAPORT_DC1, 490 descriptor, 491 0); 492 } 493 } 494 //} 495 }