1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     uchar RotationFlag = (uchar)(RotationChromaSitingFlag & 0x07); // keep only the low three bits; compared against MDF_FC_ROTATION_90 / MDF_FC_ROTATION_270 below
24 
25     /*
26     Buffer layout after shuffle
27     _________________________________________________
28     |_______Block0__________|_______Block1__________|
29     |_______Block2__________|_______Block3__________|
30     |_______Block4__________|_______Block5__________|
31     |_______Block6__________|_______Block7__________|
32 
33     Write-back buffer layout, correlated with the block numbers above; each box stands for 1 GRF
34     _______________________________________________
35     |____R0_________R1_____|____R2_________R3_____|
36     |____G0_________G1_____|____G2_________G3_____|
37     |____B0_________B1_____|____B2_________B3_____|
38     |____A0_________A1_____|____A2_________A3_____|
39     |____R4_________R5_____|____R6_________R7_____|
40     |____G4_________G5_____|____G6_________G7_____|
41     |____B4_________B5_____|____B6_________B7_____|
42     |____A4_________A5_____|____A6_________A7_____|
43     */
44 
45     // ==== Pre-Interlaced =====
46     // Buffer 0 (Field 1):
47     // ----------------------------------------------
48     // | Line 1 Left Half F1 | Line 1 Right Half F1 |
49     // | Line 2 Left Half F1 | Line 2 Right Half F1 |
50     // | Line 3 Left Half F1 | Line 3 Right Half F1 |
51     // | Line 4 Left Half F1 | Line 4 Right Half F1 |
52     // ----------------------------------------------
53     // Buffer 1 (Field 2):
54     // ----------------------------------------------
55     // | Line 1 Left Half F2 | Line 1 Right Half F2 |
56     // | Line 2 Left Half F2 | Line 2 Right Half F2 |
57     // | Line 3 Left Half F2 | Line 3 Right Half F2 |
58     // | Line 4 Left Half F2 | Line 4 Right Half F2 |
59     // ----------------------------------------------
60 
61     // ==== Post-Interlaced =====
62     // Buffer 0:
63     // ----------------------------------------------
64     // | Line 1 Left Half F1 | Line 1 Right Half F1 |
65     // | Line 1 Left Half F2 | Line 1 Right Half F2 |
66     // | Line 2 Left Half F1 | Line 2 Right Half F1 |
67     // | Line 2 Left Half F2 | Line 2 Right Half F2 |
68     // ----------------------------------------------
69     // Buffer 1:
70     // ----------------------------------------------
71     // | Line 3 Left Half F1 | Line 3 Right Half F1 |
72     // | Line 3 Left Half F2 | Line 3 Right Half F2 |
73     // | Line 4 Left Half F1 | Line 4 Right Half F1 |
74     // | Line 4 Left Half F2 | Line 4 Right Half F2 |
75     // ----------------------------------------------
76 
77 #ifdef BUFFER_0
   // BUFFER_0 build: the two field buffers processed below are DataBuffer0 / DataBuffer1.
78 #define WriteBackBuffer_F1 DataBuffer0
79 #define WriteBackBuffer_F2 DataBuffer1
80 #endif
81 
82 #ifdef BUFFER_2
   // BUFFER_2 build: the two field buffers processed below are DataBuffer2 / DataBuffer3.
   // Both aliases are #undef'd again at the end of this fragment.
   // NOTE(review): nothing visible here handles the case where neither (or both) of
   // BUFFER_0 / BUFFER_2 is defined -- presumably the including file defines exactly
   // one of them; confirm.
83 #define WriteBackBuffer_F1 DataBuffer2
84 #define WriteBackBuffer_F2 DataBuffer3
85 #endif
86 
87     if (RotationFlag == MDF_FC_ROTATION_90 || RotationFlag == MDF_FC_ROTATION_270)
88     {
89         matrix<ushort, 1, 8> temp; // 8-lane staging row: filled as two 4-lane halves (even / odd lanes), then stored
           // temp1 and temp2 are references into DataBuffer4 viewed as 16 rows x 16 ushort:
           // temp1 covers rows 0-1 and temp2 covers row 2, so assigning to them writes
           // into DataBuffer4 itself.
           // NOTE(review): this presumes DataBuffer4 holds no live data at this point -- confirm.
90         matrix_ref<ushort, 2, 16> temp1 = DataBuffer4.format<ushort, 16, 16>().select<2, 1, 16, 1>(0, 0);
91         matrix_ref<ushort, 1, 16> temp2 = DataBuffer4.format<ushort, 16, 16>().select<1, 1, 16, 1>(2, 0);
92 
93 #ifdef OUTPUT_PA
94 #pragma unroll
95         for (short j = 0; j < 4; j++)
96         {
               // Iteration j touches only rows 2j, 2j+1, 2j+8 and 2j+9 of the two field
               // buffers (each viewed as 16 rows x 16 ushort). Every 8-lane output group is
               // assembled in `temp` with four WriteBackBuffer_F2 samples in the even lanes
               // (0,2,4,6) and four WriteBackBuffer_F1 samples in the odd lanes (1,3,5,7),
               // then stored as one contiguous 8-lane group.
               // Statement order matters: every source quad is either read or saved to
               // temp1/temp2 before the store that overwrites it.
97             // Save the F1 data that gets overwritten before it is consumed:
               //   temp1 <- F1 rows 2j+8 and 2j+9 (all 16 columns)
               //   temp2 <- F1 columns 4-7 and 12-15 of rows 2j and 2j+1 (four quads)
98             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2 * j + 8, 0);
99             temp2.select<1, 1, 4, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4);
100             temp2.select<1, 1, 4, 1>(0, 4) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12);
101             temp2.select<1, 1, 4, 1>(0, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4);
102             temp2.select<1, 1, 4, 1>(0, 12) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12);
103 
104             // Interlace Top field: rewrite WriteBackBuffer_F1.
                // Rows 2j and 2j+1 receive columns 0-3 and 8-11 of the same rows of F2 / F1.
105             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0);
106             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0);
107             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp;
108 
109             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8);
110             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8);
111             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp;
112 
113             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0);
114             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0);
115             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
116 
117             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8);
118             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8);
119             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp;
120 
121 
                // F1 rows 2j+8 and 2j+9 receive columns 4-7 and 12-15 of rows 2j and 2j+1;
                // the F1 samples come from the temp2 copy because F1 rows 2j / 2j+1 were
                // already overwritten above.
122             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4);
123             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 0);
124             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
125 
126             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12);
127             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 4);
128             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
129 
130             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4);
131             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 8);
132             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
133 
134             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12);
135             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 12);
136             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp;
137 
138             // Interlace Bottom field: rewrite WriteBackBuffer_F2 from rows 2j+8 and 2j+9
                // of both fields; the F1 samples come from the temp1 copy saved at the top.
                // F2 rows 2j and 2j+1 receive columns 0-3 and 8-11 of rows 2j+8 and 2j+9.
139             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 0);
140             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 0);
141             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp;
142 
143             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 8);
144             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 8);
145             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp;
146 
147 
148 
149             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 0);
150             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 0);
151             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
152 
153             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 8);
154             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 8);
155             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp;
156 
157 
158 
                // F2 rows 2j+8 and 2j+9 are rebuilt in place from their own columns 4-7 and
                // 12-15; columns 0-3 and 8-11 of these rows were already consumed above.
159             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 4);
160             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 4);
161             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
162 
163             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 12);
164             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 12);
165             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
166 
167 
168 
169             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 4);
170             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 4);
171             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
172 
173             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 12);
174             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 12);
175             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp;
176         }
177 #endif
178 #ifdef OUTPUT_420
179 #pragma unroll
180         for (short j = 0; j < 3; j++)
181         {
                // Three iterations; iteration j touches only rows 2j, 2j+1, 2j+8 and 2j+9 of
                // the two field buffers (each viewed as 16 rows x 16 ushort).
                // NOTE(review): presumably one iteration per Y/U/V plane -- confirm.
                // Every 8-lane output group is assembled in `temp` with four
                // WriteBackBuffer_F2 samples in the even lanes (0,2,4,6) and four
                // WriteBackBuffer_F1 samples in the odd lanes (1,3,5,7), then stored.
                // Statement order matters: every source quad is either read or saved to
                // temp1/temp2 before the store that overwrites it.
                // The two commented-out statements and the comment block that follow are
                // disabled code left in place; they have no effect on the build.
182             //temp1.format<ushort, 4, 8>().select<2, 1, 8, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 8, 1>(2 * j, 8);
183             //temp1.format<ushort, 4, 8>().select<2, 1, 8, 1>(2, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 8, 1>(2 * j + 8, 8);
184 
185             /*
186             // Reorder lines inside field #1
187             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8);
188             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0);
189             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0);
190             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
191 
192             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8);
193             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0);
194             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8);
195             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
196 
197             // Reorder lines inside field #2
198             temp = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8);
199             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0);
200             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0);
201             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
202 
203             temp = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8);
204             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0);
205             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8);
206             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
207             */
208 
209             // Save the F1 data that gets overwritten before it is consumed:
                //   temp1 <- F1 rows 2j+8 and 2j+9 (all 16 columns)
                //   temp2 <- F1 columns 4-7 and 12-15 of rows 2j and 2j+1 (four quads)
210             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<2, 1, 16, 1>(2 * j + 8, 0);
211             temp2.select<1, 1, 4, 1>(0, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4);
212             temp2.select<1, 1, 4, 1>(0, 4) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12);
213             temp2.select<1, 1, 4, 1>(0, 8) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4);
214             temp2.select<1, 1, 4, 1>(0, 12) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12);
215 
216             // Interlace Top field: rewrite WriteBackBuffer_F1.
                // Rows 2j and 2j+1 receive columns 0-3 and 8-11 of the same rows of F2 / F1.
217             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0);
218             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 0);
219             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp;
220 
221             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8);
222             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 8);
223             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp;
224 
225             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0);
226             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 0);
227             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
228 
229             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8);
230             temp.select<1, 1, 4, 2>(0, 1) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 8);
231             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp;
232 
233 
                // F1 rows 2j+8 and 2j+9 receive columns 4-7 and 12-15 of rows 2j and 2j+1;
                // the F1 samples come from the temp2 copy because F1 rows 2j / 2j+1 were
                // already overwritten above.
234             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 4);
235             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 0);
236             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
237 
238             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j, 12);
239             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 4);
240             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
241 
242             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 4);
243             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 8);
244             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
245 
246             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 1, 12);
247             temp.select<1, 1, 4, 2>(0, 1) = temp2.select<1, 1, 4, 1>(0, 12);
248             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp;
249 
250             // Interlace Bottom field: rewrite WriteBackBuffer_F2 from rows 2j+8 and 2j+9
                // of both fields; the F1 samples come from the temp1 copy saved at the top.
                // F2 rows 2j and 2j+1 receive columns 0-3 and 8-11 of rows 2j+8 and 2j+9.
251             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 0);
252             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 0);
253             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = temp;
254 
255             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 8);
256             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 8);
257             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = temp;
258 
259 
260 
261             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 0);
262             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 0);
263             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
264 
265             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 8);
266             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 8);
267             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 8) = temp;
268 
269 
270 
                // F2 rows 2j+8 and 2j+9 are rebuilt in place from their own columns 4-7 and
                // 12-15; columns 0-3 and 8-11 of these rows were already consumed above.
271             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 4);
272             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 4);
273             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = temp;
274 
275             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 8, 12);
276             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(0, 12);
277             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = temp;
278 
279 
280 
281             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 4);
282             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 4);
283             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
284 
285             temp.select<1, 1, 4, 2>(0, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 4, 1>(2 * j + 9, 12);
286             temp.select<1, 1, 4, 2>(0, 1) = temp1.select<1, 1, 4, 1>(1, 12);
287             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 8) = temp;
288     }
289 #endif
290     }
291     else
292     {
293         matrix<ushort, 1, 8> temp; // holds one 8-ushort half row while three half rows are rotated
294         matrix<ushort, 1, 16> temp1; // holds one full 16-ushort row; only referenced in the OUTPUT_420 U/V swaps below
295 
296 #ifdef OUTPUT_PA
297 #pragma unroll
298         for (short j = 0; j < 4; j++)
299         {
                // Each group of four statements rotates three 8-ushort half rows through
                // `temp` (lo = columns 0-7, hi = columns 8-15). Net effect on rows 2j / 2j+1:
                //   F1[2j]   = { F1[2j].lo,   F2[2j].lo   }
                //   F1[2j+1] = { F1[2j].hi,   F2[2j].hi   }
                //   F2[2j]   = { F1[2j+1].lo, F2[2j+1].lo }
                //   F2[2j+1] = { F1[2j+1].hi, F2[2j+1].hi }
                // (right-hand sides are the values before this iteration). The second pair
                // of groups does the same for rows 2j+8 / 2j+9.
300             // Rows 2j and 2j+1 (labelled "RGBA channel left half" in the original comment)
301             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8);
302             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0);
303             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0);
304             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
305 
306             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2 * j, 8);
307             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8);
308             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0);
309             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 1, 0) = temp;
310 
311             // Rows 2j+8 and 2j+9 (labelled "RGBA channel right half" in the original comment)
312             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8);
313             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0);
314             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0);
315             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
316 
317             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2 * j, 8);
318             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2 * j, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8);
319             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0);
320             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 * j + 9, 0) = temp;
321         }
322 #endif
323 #ifdef OUTPUT_420
324         // Y Channel: rows 2-3 and 10-11 of both field buffers (16 rows x 16 ushort view)
325         {
                // Each group of four statements rotates three 8-ushort half rows through
                // `temp` (lo = columns 0-7, hi = columns 8-15). Net effect on rows 2 / 3:
                //   F1[2] = { F1[2].lo, F2[2].lo }    F1[3] = { F1[2].hi, F2[2].hi }
                //   F2[2] = { F1[3].lo, F2[3].lo }    F2[3] = { F1[3].hi, F2[3].hi }
                // (right-hand sides are the values before this block runs).
326             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8);
327             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 0);
328             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0);
329             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0) = temp;
330 
331             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2, 8);
332             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(1 + 2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8);
333             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0);
334             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 1, 0) = temp;
335 
336             // Same rotation for rows 10 and 11 (row offset +8) of this Y block
337             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8);
338             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 0);
339             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 0) = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0);
340             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0) = temp;
341 
342             temp = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2, 8);
343             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 8, 1>(9 + 2, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8);
344             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 8, 8) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0);
345             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 8, 1>(2 + 9, 0) = temp;
346         }
347 
348         // U Channel: swap whole 16-ushort rows between the two field buffers
349         {
                // Exchange F1 row 5 with F2 row 4, using temp1 as the swap buffer.
350             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5, 0);
351             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4, 0);
352             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4, 0) = temp1;
353 
                // Same exchange eight rows further down: F1 row 13 with F2 row 12.
354             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5 + 8, 0);
355             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(5 + 8, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4 + 8, 0);
356             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(4 + 8, 0) = temp1;
357         }
358 
359         // V Channel: swap whole 16-ushort rows between the two field buffers
360         {
                // Exchange F1 row 1 with F2 row 0, using temp1 as the swap buffer.
361             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1, 0);
362             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0, 0);
363             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0, 0) = temp1;
364 
                // Same exchange eight rows further down: F1 row 9 with F2 row 8.
365             temp1 = WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1 + 8, 0);
366             WriteBackBuffer_F1.format<ushort, 16, 16>().select<1, 1, 16, 1>(1 + 8, 0) = WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0 + 8, 0);
367             WriteBackBuffer_F2.format<ushort, 16, 16>().select<1, 1, 16, 1>(0 + 8, 0) = temp1;
368         }
369 #endif
370     }
371 #undef WriteBackBuffer_F1
372 #undef WriteBackBuffer_F2
373 }