1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     /*
24     Buffer layout after shuffle
25     _________________________________________________
26     |_______Block0__________|_______Block1__________|
27     |_______Block2__________|_______Block3__________|
28     |_______Block4__________|_______Block5__________|
29     |_______Block6__________|_______Block7__________|
30 
31     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
32     _______________________________________________
33     |____R0_________R1_____|____R2_________R3_____|
34     |____G0_________G1_____|____G2_________G3_____|
35     |____B0_________B1_____|____B2_________B3_____|
36     |____A0_________A1_____|____A2_________A3_____|
37     |____R4_________R5_____|____R6_________R7_____|
38     |____G4_________G5_____|____G6_________G7_____|
39     |____B4_________B5_____|____B6_________B7_____|
40     |____A4_________A5_____|____A6_________A7_____|
41     */
42 
43     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
44 
45     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
46     matrix_ref<uint, 4, 8> TempResult4x8_Top    = Result.select<4, 1, 8, 1>(0, 0);
47     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
48 #pragma unroll
49     for (uchar i = 0; i < 2; i++, DstY += 8)
50     {
51         // Rounding
52         DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i, 0) = cm_add<ushort>(DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i, 0), 0x80, SAT);
53         DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 8, 0) = cm_add<ushort>(DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 8, 0), 0x80, SAT);
54         DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 16, 0) = cm_add<ushort>(DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 16, 0), 0x80, SAT);
55         DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 24, 0) = cm_add<ushort>(DataBuffer.format<ushort, 96, 16>().select<6, 1, 16, 1>(32 * i + 24, 0), 0x80, SAT);
56 
57         // First 8x16
58         {
59             // Y channel
60             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(2 + 32 * i, 1);
61             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(0, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(10 + 32 * i, 1);
62             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(2 + 32 * i, 17);
63             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(1, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(10 + 32 * i, 17);
64             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(3 + 32 * i, 1);
65             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(2, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(11 + 32 * i, 1);
66             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(3 + 32 * i, 17);
67             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 8, 2>(3, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(11 + 32 * i, 17);
68 
69             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(18 + 32 * i, 1);
70             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(0, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(26 + 32 * i, 1);
71             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(18 + 32 * i, 17);
72             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(1, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(26 + 32 * i, 17);
73             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(19 + 32 * i, 1);
74             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(2, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(27 + 32 * i, 1);
75             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(19 + 32 * i, 17);
76             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 8, 2>(3, 16 + DestinationPackedYOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 8, 2>(27 + 32 * i, 17);
77 
78             // UV channel
79             // V channel
80             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i, 1);
81             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 8, 1);
82             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i, 17);
83             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 8, 17);
84             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 1, 1);
85             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 9, 1);
86             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 1, 17);
87             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 9, 17);
88 
89             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 16, 1);
90             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 24, 1);
91             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 16, 17);
92             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 24, 17);
93             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 17, 1);
94             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 25, 1);
95             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 17, 17);
96             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, 16 + DestinationPackedVOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 25, 17);
97 
98             // U channel
99             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 4, 1);
100             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 12, 1);
101             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 4, 17);
102             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 12, 17);
103             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 5, 1);
104             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 13, 1);
105             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 5, 17);
106             TempResult4x8_Top.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 13, 17);
107 
108             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 20, 1);
109             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(0, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 28, 1);
110             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 20, 17);
111             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(1, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 28, 17);
112             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 21, 1);
113             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(2, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 29, 1);
114             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 21, 17);
115             TempResult4x8_Bottom.format<uchar, 4, 32>().select<1, 1, 4, 4>(3, 16 + DestinationPackedUOffset) = DataBuffer.format<uchar, 96, 32>().select<1, 1, 4, 4>(32 * i + 29, 17);
116 
117             write(Dst_Surface, DstX * 2, DstY, Result);
118         }
119     }
120 }