Source/Core_Kernels/Save_Y416.h

/*
* Copyright (c) 2019, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

{
    /*
    Buffer layout after shuffle
    _________________________________________________
    |_______Block0__________|_______Block1__________|
    |_______Block2__________|_______Block3__________|
    |_______Block4__________|_______Block5__________|
    |_______Block6__________|_______Block7__________|

    Write back buffer layout correlate to the block number#, each box stands for 1 GRF
    _______________________________________________
    |____R0_________R1_____|____R2_________R3_____|
    |____G0_________G1_____|____G2_________G3_____|
    |____B0_________B1_____|____B2_________B3_____|
    |____A0_________A1_____|____A2_________A3_____|
    |____R4_________R5_____|____R6_________R7_____|
    |____G4_________G5_____|____G6_________G7_____|
    |____B4_________B5_____|____B6_________B7_____|
    |____A4_________A5_____|____A6_________A7_____|
    */

    matrix_ref<uint, 8, 8> Result = DataBuffer.format<uint, 96, 8>().select<8, 1, 8, 1>(64, 0);

    SurfaceIndex Dst_Surface(MDF_FC_OUTPUT_BTI_START);
    matrix_ref<uint, 4, 8> TempResult4x8_Top = Result.select<4, 1, 8, 1>(0, 0);
    matrix_ref<uint, 4, 8> TempResult4x8_Bottom = Result.select<4, 1, 8, 1>(4, 0);

    ushort TempA = DestinationRGBFormat << 8;

#pragma unroll
    for (uchar i = 0; i < 2; i++, DstY += 8)
    {
        // First 8x8
        {

            // first 8x4
            {
                // R/G/B channel top half
                matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0);
                matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8);
                matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16);
                matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24);

                matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32);
                matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40);
                matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48);
                matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56);

                matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0);
                matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8);
                matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16);
                matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24);

                TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);

                // R/G/B channel bottom half
                matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0);
                matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8);
                matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16);
                matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24);

                matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32);
                matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40);
                matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48);
                matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56);

                matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0);
                matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8);
                matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16);
                matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);

                write(Dst_Surface, DstX * 8, DstY, Result);
            }

            // second 8x4
            {
                // R/G/B channel top half
                matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 0 + 4);
                matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 8 + 4);
                matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 16 + 4);
                matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 24 + 4);

                matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i, 32 + 4);
                matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i, 40 + 4);
                matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i, 48 + 4);
                matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i, 56 + 4);

                matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 0 + 4);
                matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 8 + 4);
                matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 16 + 4);
                matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 1, 24 + 4);

                TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);

                // R/G/B channel bottom half
                matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 0 + 4);
                matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 8 + 4);
                matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 16 + 4);
                matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 24 + 4);

                matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 32 + 4);
                matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 40 + 4);
                matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 48 + 4);
                matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 4, 56 + 4);

                matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 0 + 4);
                matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 8 + 4);
                matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 16 + 4);
                matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 5, 24 + 4);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);

                write(Dst_Surface, DstX * 8 + 32, DstY, Result);
            }
        }

        // Second 8x8
        {
            // first 8x4
            {
                // R/G/B channel top half
                matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0);
                matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8);
                matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16);
                matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24);

                matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32);
                matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40);
                matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48);
                matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56);

                matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0);
                matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8);
                matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16);
                matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24);

                TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);

                // R/G/B channel bottom half
                matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0);
                matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8);
                matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16);
                matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24);

                matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32);
                matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40);
                matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48);
                matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56);

                matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0);
                matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8);
                matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16);
                matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);

                write(Dst_Surface, DstX * 8 + 64, DstY, Result);
            }

            // second 8x4
            {
                // R/G/B channel top half
                matrix_ref<ushort, 1, 4> TempR0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 0 + 4);
                matrix_ref<ushort, 1, 4> TempR2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 8 + 4);
                matrix_ref<ushort, 1, 4> TempR4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 16 + 4);
                matrix_ref<ushort, 1, 4> TempR6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 24 + 4);

                matrix_ref<ushort, 1, 4> TempG0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 32 + 4);
                matrix_ref<ushort, 1, 4> TempG2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 40 + 4);
                matrix_ref<ushort, 1, 4> TempG4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 48 + 4);
                matrix_ref<ushort, 1, 4> TempG6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 2, 56 + 4);

                matrix_ref<ushort, 1, 4> TempB0 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 0 + 4);
                matrix_ref<ushort, 1, 4> TempB2 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 8 + 4);
                matrix_ref<ushort, 1, 4> TempB4 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 16 + 4);
                matrix_ref<ushort, 1, 4> TempB6 = DataBuffer.select<1, 1, 4, 1>(8 * i + 3, 24 + 4);

                TempResult4x8_Top.select<1, 1, 4, 2>(0, 0) = (TempB0) + (TempG0 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(0, 1) = (TempR0) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(1, 0) = (TempB2) + (TempG2 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(1, 1) = (TempR2) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(2, 0) = (TempB4) + (TempG4 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(2, 1) = (TempR4) + (TempA << 16);

                TempResult4x8_Top.select<1, 1, 4, 2>(3, 0) = (TempB6) + (TempG6 << 16);
                TempResult4x8_Top.select<1, 1, 4, 2>(3, 1) = (TempR6) + (TempA << 16);

                // R/G/B channel bottom half
                matrix_ref<ushort, 1, 4> TempR8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 0 + 4);
                matrix_ref<ushort, 1, 4> TempR10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 8 + 4);
                matrix_ref<ushort, 1, 4> TempR12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 16 + 4);
                matrix_ref<ushort, 1, 4> TempR14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 24 + 4);

                matrix_ref<ushort, 1, 4> TempG8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 32 + 4);
                matrix_ref<ushort, 1, 4> TempG10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 40 + 4);
                matrix_ref<ushort, 1, 4> TempG12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 48 + 4);
                matrix_ref<ushort, 1, 4> TempG14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 6, 56 + 4);

                matrix_ref<ushort, 1, 4> TempB8 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 0 + 4);
                matrix_ref<ushort, 1, 4> TempB10 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 8 + 4);
                matrix_ref<ushort, 1, 4> TempB12 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 16 + 4);
                matrix_ref<ushort, 1, 4> TempB14 = DataBuffer.select<1, 1, 4, 1>(8 * i + 7, 24 + 4);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 0) = (TempB8) + (TempG8 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(0, 1) = (TempR8) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 0) = (TempB10) + (TempG10 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(1, 1) = (TempR10) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 0) = (TempB12) + (TempG12 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(2, 1) = (TempR12) + (TempA << 16);

                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 0) = (TempB14) + (TempG14 << 16);
                TempResult4x8_Bottom.select<1, 1, 4, 2>(3, 1) = (TempR14) + (TempA << 16);

                write(Dst_Surface, DstX * 8 + 96, DstY, Result);
            }
        }
    }
}