1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 { 23 uchar RotationFlag = (uchar)(RotationChromaSitingFlag & 0x07); 24 25 /* 26 Buffer layout after shuffle 27 _________________________________________________ 28 |_______Block0__________|_______Block1__________| 29 |_______Block2__________|_______Block3__________| 30 |_______Block4__________|_______Block5__________| 31 |_______Block6__________|_______Block7__________| 32 33 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 34 _______________________________________________ 35 |____R0_________R1_____|____R2_________R3_____| 36 |____G0_________G1_____|____G2_________G3_____| 37 |____B0_________B1_____|____B2_________B3_____| 38 |____A0_________A1_____|____A2_________A3_____| 39 |____R4_________R5_____|____R6_________R7_____| 40 |____G4_________G5_____|____G6_________G7_____| 41 |____B4_________B5_____|____B6_________B7_____| 42 |____A4_________A5_____|____A6_________A7_____| 43 */ 44 45 // ==== Pre-Interlaced ===== 46 // Buffer 0 (Field 1): 47 // ---------------------------------------------- 48 // | Line 1 Left Half F1 | Line 1 Right Half F1 | 49 // | Line 2 Left Half F1 | Line 2 Right Half F1 | 50 // | Line 3 Left Half F1 | Line 3 Right Half F1 | 51 // | Line 4 Left Half F1 | Line 4 Right Half F1 | 52 // ---------------------------------------------- 53 // Buffer 1 (Field 2): 54 // ---------------------------------------------- 55 // | Line 1 Left Half F2 | Line 1 Right Half F2 | 56 // | Line 2 Left Half F2 | Line 2 Right Half F2 | 57 // | Line 3 Left Half F2 | Line 3 Right Half F2 | 58 // | Line 4 Left Half F2 | Line 4 Right Half F2 | 59 // ---------------------------------------------- 60 61 // ==== Post-Interlaced ===== 62 // Buffer 0: 63 // ---------------------------------------------- 64 // | Line 1 Left Half F1 | Line 1 Right Half F1 | 65 // | Line 1 Left Half F2 | Line 1 Right Half F2 | 66 // | Line 2 Left Half F1 | Line 2 Right Half F1 | 67 // | Line 2 Left Half F2 | Line 2 Right Half F2 | 68 // ---------------------------------------------- 69 // Buffer 1: 70 // ---------------------------------------------- 71 // | Line 3 Left Half F1 | Line 3 Right Half F1 | 72 // | Line 3 Left Half F2 | Line 3 Right Half F2 | 73 // | Line 4 Left Half F1 | Line 4 Right Half F1 | 74 // | Line 4 Left Half F2 | Line 4 Right Half F2 | 75 // ---------------------------------------------- 76 77 #ifdef BUFFER_0 78 #define WriteBackBuffer_F1 DataBuffer0 79 #define WriteBackBuffer_F2 DataBuffer1 80 #endif 81 82 #ifdef BUFFER_2 83 #define WriteBackBuffer_F1 DataBuffer2 84 #define WriteBackBuffer_F2 DataBuffer3 85 #endif 86 87 if (RotationFlag == MDF_FC_ROTATION_90 || RotationFlag == MDF_FC_ROTATION_270) 88 { 89 matrix<ushort, 1, 8> temp; 90 matrix_ref<ushort, 2, 16> temp1 = DataBuffer4.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0); 91 matrix_ref<ushort, 1, 16> temp2 = DataBuffer4.format<ushort, 16, 16>().select<1, 1, 16, 1>(2, 0); 92 93 #ifdef OUTPUT_PA 94 #pragma unroll 95 for (short j = 0; j < 4; j++) 96 { 97 // Store temp data 98 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2 * j + 8, 0); 99 temp2.select<1, 1, 4, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4); 100 temp2.select<1, 1, 4, 1>(0, 4) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12); 101 temp2.select<1, 1, 4, 1>(0, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4); 102 temp2.select<1, 1, 4, 1>(0, 12) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12); 103 104 // Interlace Top field 105 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0); 106 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0); 107 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp; 108 109 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8); 110 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8); 111 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp; 112 113 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0); 114 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0); 115 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 116 117 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8); 118 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8); 119 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp; 120 121 122 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4); 123 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 0); 124 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 125 126 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12); 127 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 4); 128 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 129 130 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4); 131 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 8); 132 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 133 134 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12); 135 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 12); 136 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp; 137 138 // Interlace Bottom field 139 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 0); 140 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 0); 141 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp; 142 143 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 8); 144 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 8); 145 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp; 146 147 148 149 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 0); 150 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 0); 151 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 152 153 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 8); 154 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 8); 155 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp; 156 157 158 159 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 4); 160 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 4); 161 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 162 163 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 12); 164 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 12); 165 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 166 167 168 169 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 4); 170 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 4); 171 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 172 173 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 12); 174 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 12); 175 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp; 176 } 177 #endif 178 #ifdef OUTPUT_420 179 #pragma unroll 180 for (short j = 0; j < 3; j++) 181 { 182 //temp1.format<ushort, 4, 8>().select<2, 1, 8, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 8, 1>(2 * j, 8); 183 //temp1.format<ushort, 4, 8>().select<2, 1, 8, 1>(2, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 8, 1>(2 * j + 8, 8); 184 185 /* 186 // Reorder lines inside field #1 187 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8); 188 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0); 189 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0); 190 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 191 192 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8); 193 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0); 194 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8); 195 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 196 197 // Reorder lines inside field #2 198 temp = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8); 199 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0); 200 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0); 201 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 202 203 temp = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8); 204 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0); 205 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8); 206 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 207 */ 208 209 // Store temp data 210 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2 * j + 8, 0); 211 temp2.select<1, 1, 4, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4); 212 temp2.select<1, 1, 4, 1>(0, 4) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12); 213 temp2.select<1, 1, 4, 1>(0, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4); 214 temp2.select<1, 1, 4, 1>(0, 12) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12); 215 216 // Interlace Top field 217 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0); 218 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0); 219 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp; 220 221 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8); 222 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8); 223 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp; 224 225 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0); 226 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0); 227 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 228 229 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8); 230 temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8); 231 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp; 232 233 234 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4); 235 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 0); 236 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 237 238 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12); 239 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 4); 240 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 241 242 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4); 243 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 8); 244 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 245 246 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12); 247 temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 12); 248 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp; 249 250 // Interlace Bottom field 251 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 0); 252 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 0); 253 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp; 254 255 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 8); 256 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 8); 257 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp; 258 259 260 261 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 0); 262 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 0); 263 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 264 265 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 8); 266 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 8); 267 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp; 268 269 270 271 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 4); 272 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 4); 273 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp; 274 275 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 12); 276 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 12); 277 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp; 278 279 280 281 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 4); 282 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 4); 283 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 284 285 temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 12); 286 temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 12); 287 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp; 288 } 289 #endif 290 } 291 else 292 { 293 matrix<ushort, 1, 8> temp; 294 matrix<ushort, 1, 16> temp1; 295 296 #ifdef OUTPUT_PA 297 #pragma unroll 298 for (short j = 0; j < 4; j++) 299 { 300 // RGBA channel left half 301 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8); 302 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0); 303 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0); 304 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 305 306 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2 * j, 8); 307 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8); 308 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0); 309 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp; 310 311 // RGBA channel right half 312 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8); 313 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0); 314 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0); 315 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 316 317 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2 * j, 8); 318 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8); 319 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0); 320 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp; 321 } 322 #endif 323 #ifdef OUTPUT_420 324 // Y Channel 325 { 326 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8); 327 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 0); 328 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0); 329 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0) = temp; 330 331 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2, 8); 332 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8); 333 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0); 334 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0) = temp; 335 336 // RGBA channel right half 337 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8); 338 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 0); 339 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0); 340 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0) = temp; 341 342 temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2, 8); 343 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8); 344 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0); 345 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0) = temp; 346 } 347 348 // U Channel 349 { 350 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5, 0); 351 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4, 0); 352 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4, 0) = temp1; 353 354 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5 + 8, 0); 355 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5 + 8, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4 + 8, 0); 356 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4 + 8, 0) = temp1; 357 } 358 359 // V Channel 360 { 361 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1, 0); 362 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0, 0); 363 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0, 0) = temp1; 364 365 temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1 + 8, 0); 366 WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1 + 8, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0 + 8, 0); 367 WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0 + 8, 0) = temp1; 368 } 369 #endif 370 } 371 #undef WriteBackBuffer_F1 372 #undef WriteBackBuffer_F2 373 }