1 /* 2 * Copyright (c) 2019, Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 { 23 /* 24 Buffer layout after shuffle 25 _________________________________________________ 26 |_______Block0__________|_______Block1__________| 27 |_______Block2__________|_______Block3__________| 28 |_______Block4__________|_______Block5__________| 29 |_______Block6__________|_______Block7__________| 30 31 Write back buffer layout correlate to the block number#, each box stands for 1 GRF 32 _______________________________________________ 33 |____R0_________R1_____|____R2_________R3_____| 34 |____G0_________G1_____|____G2_________G3_____| 35 |____B0_________B1_____|____B2_________B3_____| 36 |____A0_________A1_____|____A2_________A3_____| 37 |____R4_________R5_____|____R6_________R7_____| 38 |____G4_________G5_____|____G6_________G7_____| 39 |____B4_________B5_____|____B6_________B7_____| 40 |____A4_________A5_____|____A6_________A7_____| 41 */ 42 43 matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0); 44 45 SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START); 46 matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0); 47 matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0); 48 49 DestinationPackedYOffset = DestinationPackedYOffset >> 1; 50 DestinationPackedUOffset = DestinationPackedUOffset >> 1; 51 DestinationPackedVOffset = DestinationPackedVOffset >> 1; 52 53 #pragma unroll 54 for (uchar i = 0; i < 2; i++, DstY += 8) 55 { 56 // Left 8x16 57 { 58 // Y channel 59 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 0) & 0xFFC0; 60 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(2 + 32 * i, 8) & 0xFFC0; 61 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 0) & 0xFFC0; 62 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(3 + 32 * i, 8) & 0xFFC0; 63 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 0) & 0xFFC0; 64 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(18 + 32 * i, 8) & 0xFFC0; 65 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 0) & 0xFFC0; 66 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(19 + 32 * i, 8) & 0xFFC0; 67 68 69 // UV channel 70 // V channel 71 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 0) & 0xFFC0; 72 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i, 8) & 0xFFC0; 73 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 0) & 0xFFC0; 74 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 1, 8) & 0xFFC0; 75 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 0) & 0xFFC0; 76 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 16, 8) & 0xFFC0; 77 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 0) & 0xFFC0; 78 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 17, 8) & 0xFFC0; 79 80 // U channel 81 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 0) & 0xFFC0; 82 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 4, 8) & 0xFFC0; 83 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 0) & 0xFFC0; 84 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 5, 8) & 0xFFC0; 85 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 0) & 0xFFC0; 86 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 20, 8) & 0xFFC0; 87 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 0) & 0xFFC0; 88 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 21, 8) & 0xFFC0; 89 90 // Rounding 91 Result.format<ushort>() = cm_add<ushort>(Result.format<ushort>(), 0x20, SAT); 92 Result.format<ushort>() = Result.format<ushort>() & 0xFFC0; 93 94 write(Dst_Surface, 4 * DstX, DstY, Result); 95 } 96 97 // Right 8x16 98 { 99 // Y channel 100 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 0) & 0xFFC0; 101 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(10 + 32 * i, 8) & 0xFFC0; 102 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 0) & 0xFFC0; 103 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(11 + 32 * i, 8) & 0xFFC0; 104 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(0, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 0) & 0xFFC0; 105 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(1, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(26 + 32 * i, 8) & 0xFFC0; 106 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(2, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 0) & 0xFFC0; 107 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 8, 2>(3, DestinationPackedYOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 8, 1>(27 + 32 * i, 8) & 0xFFC0; 108 109 110 // UV channel 111 // V channel 112 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 0) & 0xFFC0; 113 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 8, 8) & 0xFFC0; 114 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 0) & 0xFFC0; 115 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 9, 8) & 0xFFC0; 116 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 0) & 0xFFC0; 117 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 24, 8) & 0xFFC0; 118 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 0) & 0xFFC0; 119 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedVOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 25, 8) & 0xFFC0; 120 121 // U channel 122 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 0) & 0xFFC0; 123 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 12, 8) & 0xFFC0; 124 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 0) & 0xFFC0; 125 TempResult4x8_Top.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 13, 8) & 0xFFC0; 126 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(0, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 0) & 0xFFC0; 127 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(1, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 28, 8) & 0xFFC0; 128 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(2, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 0) & 0xFFC0; 129 TempResult4x8_Bottom.format<ushort, 4, 16>().select<1, 1, 4, 4>(3, DestinationPackedUOffset) = DataBuffer.format<ushort, 96, 16>().select<1, 1, 4, 2>(32 * i + 29, 8) & 0xFFC0; 130 131 // Rounding 132 Result.format<ushort>() = cm_add<ushort>(Result.format<ushort>(), 0x20, SAT); 133 Result.format<ushort>() = Result.format<ushort>() & 0xFFC0; 134 135 write(Dst_Surface, 4 * DstX + 32, DstY, Result); 136 } 137 } 138 }