1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 {
23     matrix<ushort, 1, 16> Temp;
24 
25 #pragma unroll
26     for (int i = 0; i < 4; i++)
27     {
28         /*
29         Buffer layout after shuffle
30         _________________________________________________
31         |_______Block0__________|_______Block1__________|
32         |_______Block2__________|_______Block3__________|
33         |_______Block4__________|_______Block5__________|
34         |_______Block6__________|_______Block7__________|
35 
36         Write back buffer layout correlate to the block number#, each box stands for 1 GRF
37         _______________________________________________
38         |____R0_________R1_____|____R2_________R3_____|
39         |____G0_________G1_____|____G2_________G3_____|
40         |____B0_________B1_____|____B2_________B3_____|
41         |____A0_________A1_____|____A2_________A3_____|
42         |____R4_________R5_____|____R6_________R7_____|
43         |____G4_________G5_____|____G6_________G7_____|
44         |____B4_________B5_____|____B6_________B7_____|
45         |____A4_________A5_____|____A6_________A7_____|
46         */
47         {
48             matrix_ref<ushort, 1, 16> Alpha = DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + 6, 0);
49 
50             // R0/G0/B0/A0
51             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_0, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(0) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
52             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_0, 0).merge(Temp, (ColorFill(0) << 8), TempMask0[0][4 * i]);
53             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_0, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(1) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
54             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_0, 0).merge(Temp, (ColorFill(1) << 8), TempMask0[0][4 * i]);
55             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_0, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(2) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
56             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_0, 0).merge(Temp, (ColorFill(2) << 8), TempMask0[0][4 * i]);
57             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_0, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(3) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
58             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_0, 0).merge(Temp, (ColorFill(3) << 8), TempMask0[0][4 * i]);
59         }
60 
61         {
62             matrix_ref<ushort, 1, 16> Alpha = DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + 7, 0);
63 
64             // R1/G1/B1/A1
65             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_1, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(0) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
66             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_1, 0).merge(Temp, (ColorFill(0) << 8), TempMask0[0][4 * i + 1]);
67             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_1, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(1) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
68             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_1, 0).merge(Temp, (ColorFill(1) << 8), TempMask0[0][4 * i + 1]);
69             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_1, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(2) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
70             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_1, 0).merge(Temp, (ColorFill(2) << 8), TempMask0[0][4 * i + 1]);
71             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_1, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(3) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
72             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_1, 0).merge(Temp, (ColorFill(3) << 8), TempMask0[0][4 * i + 1]);
73         }
74 
75         {
76             matrix_ref<ushort, 1, 16> Alpha = DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + 14, 0);
77 
78             // R2/G2/B2/A2
79             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_2, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(0) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
80             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_2, 0).merge(Temp, (ColorFill(0) << 8), TempMask0[0][4 * i + 2]);
81             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_2, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(1) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
82             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_2, 0).merge(Temp, (ColorFill(1) << 8), TempMask0[0][4 * i + 2]);
83             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_2, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(2) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
84             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_2, 0).merge(Temp, (ColorFill(2) << 8), TempMask0[0][4 * i + 2]);
85             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_2, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(3) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
86             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_2, 0).merge(Temp, (ColorFill(3) << 8), TempMask0[0][4 * i + 2]);
87         }
88 
89         {
90             matrix_ref<ushort, 1, 16> Alpha = DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + 15, 0);
91 
92             // R3/G3/B3/A3
93             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_3, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(0) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
94             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_RV_3, 0).merge(Temp, (ColorFill(0) << 8), TempMask0[0][4 * i + 3]);
95             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_3, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(1) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
96             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_GY_3, 0).merge(Temp, (ColorFill(1) << 8), TempMask0[0][4 * i + 3]);
97             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_3, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(2) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
98             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_BU_3, 0).merge(Temp, (ColorFill(2) << 8), TempMask0[0][4 * i + 3]);
99             Temp = (((DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_3, 0) * (cm_add<ushort>(Alpha, 256, SAT)))) + (((ColorFill(3) << 8) * (cm_add<ushort>(0xFF00, -Alpha, SAT))))) >> 16;
100             DataBuffer.format<ushort, 96, 16>().select<1, 1, 16, 1>(16 * i + Channel_Offset_A_3, 0).merge(Temp, (ColorFill(3) << 8), TempMask0[0][4 * i + 3]);
101         }
102     }
103     TempMask0 = 0xFFFFFFFF;
104 }