1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     /*
24     Buffer layout after shuffle
25     _________________________________________________
26     |_______Block0__________|_______Block1__________|
27     |_______Block2__________|_______Block3__________|
28     |_______Block4__________|_______Block5__________|
29     |_______Block6__________|_______Block7__________|
30 
31     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
32     _______________________________________________
33     |____R0_________R1_____|____R2_________R3_____|
34     |____G0_________G1_____|____G2_________G3_____|
35     |____B0_________B1_____|____B2_________B3_____|
36     |____A0_________A1_____|____A2_________A3_____|
37     |____R4_________R5_____|____R6_________R7_____|
38     |____G4_________G5_____|____G6_________G7_____|
39     |____B4_________B5_____|____B6_________B7_____|
40     |____A4_________A5_____|____A6_________A7_____|
41     */
42 
43     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
44 
45     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
46     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
47     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
48 
49     DestinationPackedYOffset = DestinationPackedYOffset >> 1;
50     DestinationPackedUOffset = DestinationPackedUOffset >> 1;
51     DestinationPackedVOffset = DestinationPackedVOffset >> 1;
52 
53 #pragma unroll
54     for (uchar i = 0; i < 2; i++, DstY += 8)
55     {
56         // Left 8x16
57         {
58             // Y channel
59             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0) & 0xFFC0;
60             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8) & 0xFFC0;
61             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0) & 0xFFC0;
62             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8) & 0xFFC0;
63             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 0) & 0xFFC0;
64             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 8) & 0xFFC0;
65             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 0) & 0xFFC0;
66             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 8) & 0xFFC0;
67 
68 
69             // UV channel
70             // V channel
71             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 0) & 0xFFC0;
72             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 8) & 0xFFC0;
73             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 0) & 0xFFC0;
74             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 8) & 0xFFC0;
75             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 0) & 0xFFC0;
76             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 8) & 0xFFC0;
77             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 0) & 0xFFC0;
78             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 8) & 0xFFC0;
79 
80             // U channel
81             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 0) & 0xFFC0;
82             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 8) & 0xFFC0;
83             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 0) & 0xFFC0;
84             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 8) & 0xFFC0;
85             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 0) & 0xFFC0;
86             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 8) & 0xFFC0;
87             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 0) & 0xFFC0;
88             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 8) & 0xFFC0;
89 
90             // Rounding
91             Result.format<ushort>() = cm_add<ushort>(Result.format<ushort>(), 0x20, SAT);
92             Result.format<ushort>() = Result.format<ushort>() & 0xFFC0;
93 
94             write(Dst_Surface, 4 * DstX, DstY, Result);
95         }
96 
97         // Right 8x16
98         {
99             // Y channel
100             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0) & 0xFFC0;
101             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8) & 0xFFC0;
102             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0) & 0xFFC0;
103             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8) & 0xFFC0;
104             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 0) & 0xFFC0;
105             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 8) & 0xFFC0;
106             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 0) & 0xFFC0;
107             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 8) & 0xFFC0;
108 
109 
110             // UV channel
111             // V channel
112             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 0) & 0xFFC0;
113             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 8) & 0xFFC0;
114             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 0) & 0xFFC0;
115             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 8) & 0xFFC0;
116             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 0) & 0xFFC0;
117             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 8) & 0xFFC0;
118             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 0) & 0xFFC0;
119             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 8) & 0xFFC0;
120 
121             // U channel
122             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 0) & 0xFFC0;
123             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 8) & 0xFFC0;
124             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 0) & 0xFFC0;
125             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 8) & 0xFFC0;
126             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 0) & 0xFFC0;
127             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 8) & 0xFFC0;
128             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 0) & 0xFFC0;
129             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 8) & 0xFFC0;
130 
131             // Rounding
132             Result.format<ushort>() = cm_add<ushort>(Result.format<ushort>(), 0x20, SAT);
133             Result.format<ushort>() = Result.format<ushort>() & 0xFFC0;
134 
135             write(Dst_Surface, 4 * DstX + 32, DstY, Result);
136         }
137     }
138 }