1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     /*
24     Buffer layout after shuffle
25     _________________________________________________
26     |_______Block0__________|_______Block1__________|
27     |_______Block2__________|_______Block3__________|
28     |_______Block4__________|_______Block5__________|
29     |_______Block6__________|_______Block7__________|
30 
31     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
32     _______________________________________________
33     |____R0_________R1_____|____R2_________R3_____|
34     |____G0_________G1_____|____G2_________G3_____|
35     |____B0_________B1_____|____B2_________B3_____|
36     |____A0_________A1_____|____A2_________A3_____|
37     |____R4_________R5_____|____R6_________R7_____|
38     |____G4_________G5_____|____G6_________G7_____|
39     |____B4_________B5_____|____B6_________B7_____|
40     |____A4_________A5_____|____A6_________A7_____|
41     */
42 
43     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
44 
45     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
46     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
47     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
48 
49 #pragma unroll
50     for (uchar i = 0; i < 2; i++, DstY += 8)
51     {
52         // Left 8x16
53         {
54             // Y channel
55             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0);
56             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8);
57             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0);
58             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8);
59             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 0);
60             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 8);
61             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 0);
62             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 8);
63 
64 
65             // UV channel
66             // V channel
67             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 0);
68             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 8);
69             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 0);
70             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 8);
71             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 0);
72             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 8);
73             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 0);
74             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 8);
75 
76             // U channel
77             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 0);
78             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 8);
79             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 0);
80             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 8);
81             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 0);
82             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 8);
83             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 0);
84             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 8);
85 
86             write(Dst_Surface, 4 * DstX, DstY, Result);
87         }
88 
89         // Right 8x16
90         {
91             // Y channel
92             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0);
93             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8);
94             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0);
95             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8);
96             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 0);
97             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 8);
98             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 0);
99             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, 0) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 8);
100 
101 
102             // UV channel
103             // V channel
104             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 0);
105             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 8);
106             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 0);
107             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 8);
108             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 0);
109             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 8);
110             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 0);
111             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 3) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 8);
112 
113             // U channel
114             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 0);
115             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 8);
116             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 0);
117             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 8);
118             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 0);
119             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 8);
120             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 0);
121             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, 1) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 8);
122 
123             write(Dst_Surface, 4 * DstX + 32, DstY, Result);
124         }
125     }
126 }