// Copyright (c) 2018 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
20 
21 #include "cm/cm.h"
22 #include "cm/cmtl.h"
23 
// Size constants. None of them is referenced by the kernels visible in this
// part of the file; the trailing notes are the original author's.
#define INTERDATA_SIZE_SMALL    8
#define INTERDATA_SIZE_BIG      64   // 32x32 and 64x64 blocks
#define MVDATA_SIZE             4    // mfxI16Pair
#define SINGLE_MVDATA_SIZE      2    // mfxI16Pair
#define MBDIST_SIZE             64   // 16*mfxU32
#define DIST_SIZE               4
#define RF_DECISION_LEVEL       10
31 
// Shorthand wrappers around the CM select<>() member template.
//
// SLICE(VEC, FROM, HOWMANY, STEP) - HOWMANY elements of VEC starting at FROM,
//                                   taking every STEP-th element.
// SLICE1(VEC, FROM, HOWMANY)      - SLICE with a step of 1.
// SELECT_N_ROWS(m, from, nrows)   - nrows full-width rows of m starting at
//                                   row `from`.
// SELECT_N_COLS(m, from, ncols)   - ncols full-height columns of m starting at
//                                   column `from`.
//
// Every macro argument is parenthesized so that an expression can be passed
// safely; note that `m` is expanded twice in the SELECT_N_* macros.
#define SLICE(VEC, FROM, HOWMANY, STEP) ((VEC).select<HOWMANY, STEP>(FROM))
#define SLICE1(VEC, FROM, HOWMANY) SLICE(VEC, FROM, HOWMANY, 1)
#define SELECT_N_ROWS(m, from, nrows) ((m).select<nrows, 1, (m).COLS, 1>(from))
#define SELECT_N_COLS(m, from, ncols) ((m).select<(m).ROWS, 1, ncols, 1>(0, (from)))
#define OUT_BLOCK       16  // output pixels computed per thread
37 
/*----------------------------------------------------------------------*/
// ROUND_UP(offset, round_to)   - smallest multiple of round_to that is >= offset.
// ROUND_DOWN(offset, round_to) - largest multiple of round_to that is <= offset.
// Both rely on the bit mask ~(round_to - 1), so round_to must be a power of
// two; round_to is expanded more than once in ROUND_UP.
#define ROUND_UP(offset, round_to)   ( ( (offset) + (round_to) - 1) &~ ((round_to) - 1 ))
#define ROUND_DOWN(offset, round_to)   ( (offset) &~ ( ( round_to) - 1 ) )
41 
// Tile handled by one surfaceCopy_Y thread: BLOCK_PIXEL_WIDTH x BLOCK_HEIGHT
// bytes of the source surface.
#define BLOCK_PIXEL_WIDTH    (32)
#define BLOCK_HEIGHT        (8)
#define BLOCK_HEIGHT_NV12   (4)

// Size of each piece the tile is split into before it is written out
// (surfaceCopy_Y writes four SUB_BLOCK_PIXEL_WIDTH x SUB_BLOCK_HEIGHT pieces).
#define SUB_BLOCK_PIXEL_WIDTH (8)
#define SUB_BLOCK_HEIGHT      (8)
#define SUB_BLOCK_HEIGHT_NV12 (4)

// Not referenced by the kernels visible in this part of the file.
#define BLOCK_WIDTH                        (64)
#define PADDED_BLOCK_WIDTH                (128)
#define PADDED_BLOCK_WIDTH_CPU_TO_GPU    (80)
53 
// Smaller of x and y. Arguments are fully parenthesized so that expressions
// containing lower-precedence operators (|, &, ^, ?:) compare correctly.
// Each argument may be evaluated twice, so avoid side effects in them.
#define MIN(x, y)    (((x) < (y)) ? (x) : (y))
55 
56 _GENX_MAIN_  void
SubSamplePoint_p(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)57 SubSamplePoint_p(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
58 {
59     vector<uchar, OUT_BLOCK>
60         out = 0;
61     uint
62         ix = get_thread_origin_x(),
63         iy = get_thread_origin_y(),
64         step_h = in_height / out_height,
65         step_w = in_width / out_width,
66         nc = !(step_h % 2),
67         cor = (iy % 2) & nc, //To deal with interlace content
68         offset_x = (ix * step_w * OUT_BLOCK),
69         offset_y = (iy * step_h + cor);
70     matrix<uchar, 1, 1>
71         pxl;
72     vector<uint, 4>
73         lumaVal;
74 #pragma unroll(OUT_BLOCK)
75     for (int i = 0; i < OUT_BLOCK; i++) {
76         read(ibuf, offset_x, offset_y, pxl);
77         out(i) = pxl(0, 0);
78         offset_x += step_w;
79     }
80 
81     write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
82 }
83 
84 _GENX_MAIN_  void
SubSamplePoint_t(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)85 SubSamplePoint_t(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
86 {
87     vector<uchar, OUT_BLOCK>
88         out = 0;
89     uint
90         ix = get_thread_origin_x(),
91         iy = get_thread_origin_y(),
92         step_h = in_height / out_height,
93         step_w = in_width / out_width,
94         nc = (step_h % 2),
95         cor = (iy % 2) & nc,
96         offset_x = (ix * step_w * OUT_BLOCK),
97         offset_y = (iy * step_h + cor);
98     matrix<uchar, 1, 1>
99         pxl;
100     matrix<uint, 1, 1>
101         lumaVal;
102 #pragma unroll(OUT_BLOCK)
103     for (int i = 0; i < OUT_BLOCK; i++) {
104         read(ibuf, offset_x, offset_y, pxl);
105         out(i) = pxl(0, 0);
106         offset_x += step_w;
107     }
108 
109     write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
110 }
111 
112 _GENX_MAIN_  void
SubSamplePoint_b(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)113 SubSamplePoint_b(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
114 {
115     vector<uchar, OUT_BLOCK>
116         out = 0;
117     uint
118         ix          = get_thread_origin_x(),
119         iy          = get_thread_origin_y(),
120         step_h      = in_height / out_height,
121         step_w      = in_width / out_width,
122         nc          = !(step_h % 2),
123         cor         = !(iy % 2) | nc,
124         offset_x    = (ix * step_w * OUT_BLOCK),
125         offset_y    = (iy * step_h + cor);
126     matrix<uchar, 1, 1>
127         pxl;
128     matrix<uint, 1, 1>
129         lumaVal;
130 #pragma unroll(OUT_BLOCK)
131     for (int i = 0; i < OUT_BLOCK; i++) {
132         read(ibuf, offset_x, offset_y, pxl);
133         out(i) = pxl(0, 0);
134         offset_x += step_w;
135     }
136 
137     write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
138 }
139 
140 _GENX_MAIN_  void
surfaceCopy_Y(SurfaceIndex INBUF_IDX,SurfaceIndex OUTBUF_IDX,uint width_dword,uint height,uint width_stride)141 surfaceCopy_Y(SurfaceIndex INBUF_IDX, SurfaceIndex OUTBUF_IDX, uint width_dword, uint height, uint width_stride)
142 {
143     (void)width_dword;
144     (void)height;
145     (void)width_stride;
146 
147     //write Y plane
148     matrix<uchar, BLOCK_HEIGHT, BLOCK_PIXEL_WIDTH> inData_m;
149     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData0(inData_m.row(0));
150     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData1(inData_m.row(1));
151     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData2(inData_m.row(2));
152     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData3(inData_m.row(3));
153     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData4(inData_m.row(4));
154     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData5(inData_m.row(5));
155     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData6(inData_m.row(6));
156     vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData7(inData_m.row(7));
157 
158     matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData0;
159     matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData1;
160     matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData2;
161     matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData3;
162 
163     int horizOffset = get_thread_origin_x() * BLOCK_PIXEL_WIDTH;
164     int vertOffset = get_thread_origin_y() * BLOCK_HEIGHT;
165 
166     read(INBUF_IDX, horizOffset, vertOffset,     inData0);
167     read(INBUF_IDX, horizOffset, vertOffset + 1, inData1);
168     read(INBUF_IDX, horizOffset, vertOffset + 2, inData2);
169     read(INBUF_IDX, horizOffset, vertOffset + 3, inData3);
170     read(INBUF_IDX, horizOffset, vertOffset + 4, inData4);
171     read(INBUF_IDX, horizOffset, vertOffset + 5, inData5);
172     read(INBUF_IDX, horizOffset, vertOffset + 6, inData6);
173     read(INBUF_IDX, horizOffset, vertOffset + 7, inData7);
174 
175     outData0 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 0);
176     outData1 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 8);
177     outData2 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 16);
178     outData3 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 24);
179 
180     write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset,                           vertOffset, outData0);
181     write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH,   vertOffset, outData1);
182     write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH*2, vertOffset, outData2);
183     write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH*3, vertOffset, outData3);
184 }
185