1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     /*
24     Buffer layout after shuffle
25     _________________________________________________
26     |_______Block0__________|_______Block1__________|
27     |_______Block2__________|_______Block3__________|
28     |_______Block4__________|_______Block5__________|
29     |_______Block6__________|_______Block7__________|
30 
31     Write back buffer layout correlate to the block number#, each box stands for 1 GRF
32     _______________________________________________
33     |____R0_________R1_____|____R2_________R3_____|
34     |____G0_________G1_____|____G2_________G3_____|
35     |____B0_________B1_____|____B2_________B3_____|
36     |____A0_________A1_____|____A2_________A3_____|
37     |____R4_________R5_____|____R6_________R7_____|
38     |____G4_________G5_____|____G6_________G7_____|
39     |____B4_________B5_____|____B6_________B7_____|
40     |____A4_________A5_____|____A6_________A7_____|
41     */
42 
43     matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);
44 
45     SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
46 
47     matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
48     matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);
49 
50 #pragma unroll
51     for (uchar i = 0; i < 2; i++, DstY += 8)
52     {
53         // First 8x16
54         {
55             // R/G/B/A channels
56             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 0) & 0xF800);
57             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12+ 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 0) & 0xF800);
58             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(32 * i, 8) & 0xF800);
59             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 32 * i, 8) & 0xF800);
60 
61             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 0) & 0xF800);
62             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 0) & 0xF800);
63             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 32 * i, 8) & 0xF800);
64             TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 32 * i, 8) & 0xF800);
65 
66 
67             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 0) & 0xF800);
68             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(0, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 0) & 0xF800);
69             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(4 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(16 + 32 * i, 8) & 0xF800);
70             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(1, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(12 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(8 + 16 + 32 * i, 8) & 0xF800);
71 
72             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 0) & 0xF800);
73             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(2, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 0) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 0) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 0) & 0xF800);
74             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 0) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(5 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(1 + 16 + 32 * i, 8) & 0xF800);
75             TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 1>(3, 8) = (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(13 + 16 + 32 * i, 8) >> 11) + ((DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 16 + 32 * i, 8) >> 10) << 5) + (DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(9 + 16 + 32 * i, 8) & 0xF800);
76 
77             write(Dst_Surface, DstX * 2, DstY, Result);
78         }
79     }
80 }