1 // Copyright (c) 2018 Intel Corporation
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in all
11 // copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 // SOFTWARE.
20
21 #include "cm/cm.h"
22 #include "cm/cmtl.h"
23
// Size constants. The trailing notes are the original author's; their units
// are not established anywhere in this part of the file.
#define INTERDATA_SIZE_SMALL 8
#define INTERDATA_SIZE_BIG 64 // 32x32 and 64x64 blocks
#define MVDATA_SIZE 4 // mfxI16Pair
#define SINGLE_MVDATA_SIZE 2 // mfxI16Pair
#define MBDIST_SIZE 64 // 16*mfxU32
#define DIST_SIZE 4
#define RF_DECISION_LEVEL 10

// Element-selection helpers built on the select<>() member of the argument.
// SLICE:         HOWMANY elements of VEC starting at FROM, STEP apart.
// SLICE1:        same as SLICE with a step of 1.
// SELECT_N_ROWS: nrows full-width rows of m, starting at row `from`.
// SELECT_N_COLS: ncols full-height columns of m, starting at column `from`.
// NOTE: `m` is expanded without parentheses, so pass a plain identifier.
#define SLICE(VEC, FROM, HOWMANY, STEP) ((VEC).select<HOWMANY, STEP>(FROM))
#define SLICE1(VEC, FROM, HOWMANY) SLICE(VEC, FROM, HOWMANY, 1)
#define SELECT_N_ROWS(m, from, nrows) m.select<nrows, 1, m.COLS, 1>(from)
#define SELECT_N_COLS(m, from, ncols) m.select<m.ROWS, 1, ncols, 1>(0, from)
#define OUT_BLOCK 16 // output pixels computed per thread

/*----------------------------------------------------------------------*/
// Round `offset` up / down to a multiple of `round_to`. The bit-mask form is
// only correct when `round_to` is a power of two.
#define ROUND_UP(offset, round_to) ( ( (offset) + (round_to) - 1) &~ ((round_to) - 1 ))
#define ROUND_DOWN(offset, round_to) ( (offset) &~ ( ( round_to) - 1 ) )

// Dimensions of the block a surfaceCopy thread handles.
#define BLOCK_PIXEL_WIDTH (32)
#define BLOCK_HEIGHT (8)
#define BLOCK_HEIGHT_NV12 (4)

// Dimensions of the sub-blocks a block is split into for writing.
#define SUB_BLOCK_PIXEL_WIDTH (8)
#define SUB_BLOCK_HEIGHT (8)
#define SUB_BLOCK_HEIGHT_NV12 (4)

#define BLOCK_WIDTH (64)
#define PADDED_BLOCK_WIDTH (128)
#define PADDED_BLOCK_WIDTH_CPU_TO_GPU (80)

// Smaller of x and y. Every use of an argument is parenthesized so that
// operands containing low-precedence operators (e.g. `a & b`, `c ? d : e`)
// are compared as a whole. Each argument may still be evaluated twice, so do
// not pass expressions with side effects.
#define MIN(x, y) ((x) < (y) ? (x) : (y))
55
56 _GENX_MAIN_ void
SubSamplePoint_p(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)57 SubSamplePoint_p(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
58 {
59 vector<uchar, OUT_BLOCK>
60 out = 0;
61 uint
62 ix = get_thread_origin_x(),
63 iy = get_thread_origin_y(),
64 step_h = in_height / out_height,
65 step_w = in_width / out_width,
66 nc = !(step_h % 2),
67 cor = (iy % 2) & nc, //To deal with interlace content
68 offset_x = (ix * step_w * OUT_BLOCK),
69 offset_y = (iy * step_h + cor);
70 matrix<uchar, 1, 1>
71 pxl;
72 vector<uint, 4>
73 lumaVal;
74 #pragma unroll(OUT_BLOCK)
75 for (int i = 0; i < OUT_BLOCK; i++) {
76 read(ibuf, offset_x, offset_y, pxl);
77 out(i) = pxl(0, 0);
78 offset_x += step_w;
79 }
80
81 write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
82 }
83
84 _GENX_MAIN_ void
SubSamplePoint_t(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)85 SubSamplePoint_t(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
86 {
87 vector<uchar, OUT_BLOCK>
88 out = 0;
89 uint
90 ix = get_thread_origin_x(),
91 iy = get_thread_origin_y(),
92 step_h = in_height / out_height,
93 step_w = in_width / out_width,
94 nc = (step_h % 2),
95 cor = (iy % 2) & nc,
96 offset_x = (ix * step_w * OUT_BLOCK),
97 offset_y = (iy * step_h + cor);
98 matrix<uchar, 1, 1>
99 pxl;
100 matrix<uint, 1, 1>
101 lumaVal;
102 #pragma unroll(OUT_BLOCK)
103 for (int i = 0; i < OUT_BLOCK; i++) {
104 read(ibuf, offset_x, offset_y, pxl);
105 out(i) = pxl(0, 0);
106 offset_x += step_w;
107 }
108
109 write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
110 }
111
112 _GENX_MAIN_ void
SubSamplePoint_b(SurfaceIndex ibuf,SurfaceIndex obuf,uint in_width,uint in_height,uint out_width,uint out_height)113 SubSamplePoint_b(SurfaceIndex ibuf, SurfaceIndex obuf, uint in_width, uint in_height, uint out_width, uint out_height)
114 {
115 vector<uchar, OUT_BLOCK>
116 out = 0;
117 uint
118 ix = get_thread_origin_x(),
119 iy = get_thread_origin_y(),
120 step_h = in_height / out_height,
121 step_w = in_width / out_width,
122 nc = !(step_h % 2),
123 cor = !(iy % 2) | nc,
124 offset_x = (ix * step_w * OUT_BLOCK),
125 offset_y = (iy * step_h + cor);
126 matrix<uchar, 1, 1>
127 pxl;
128 matrix<uint, 1, 1>
129 lumaVal;
130 #pragma unroll(OUT_BLOCK)
131 for (int i = 0; i < OUT_BLOCK; i++) {
132 read(ibuf, offset_x, offset_y, pxl);
133 out(i) = pxl(0, 0);
134 offset_x += step_w;
135 }
136
137 write_plane(obuf, GENX_SURFACE_Y_PLANE, (ix * OUT_BLOCK), iy, out);
138 }
139
140 _GENX_MAIN_ void
surfaceCopy_Y(SurfaceIndex INBUF_IDX,SurfaceIndex OUTBUF_IDX,uint width_dword,uint height,uint width_stride)141 surfaceCopy_Y(SurfaceIndex INBUF_IDX, SurfaceIndex OUTBUF_IDX, uint width_dword, uint height, uint width_stride)
142 {
143 (void)width_dword;
144 (void)height;
145 (void)width_stride;
146
147 //write Y plane
148 matrix<uchar, BLOCK_HEIGHT, BLOCK_PIXEL_WIDTH> inData_m;
149 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData0(inData_m.row(0));
150 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData1(inData_m.row(1));
151 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData2(inData_m.row(2));
152 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData3(inData_m.row(3));
153 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData4(inData_m.row(4));
154 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData5(inData_m.row(5));
155 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData6(inData_m.row(6));
156 vector_ref<uchar, BLOCK_PIXEL_WIDTH> inData7(inData_m.row(7));
157
158 matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData0;
159 matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData1;
160 matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData2;
161 matrix<uchar, SUB_BLOCK_HEIGHT, SUB_BLOCK_PIXEL_WIDTH> outData3;
162
163 int horizOffset = get_thread_origin_x() * BLOCK_PIXEL_WIDTH;
164 int vertOffset = get_thread_origin_y() * BLOCK_HEIGHT;
165
166 read(INBUF_IDX, horizOffset, vertOffset, inData0);
167 read(INBUF_IDX, horizOffset, vertOffset + 1, inData1);
168 read(INBUF_IDX, horizOffset, vertOffset + 2, inData2);
169 read(INBUF_IDX, horizOffset, vertOffset + 3, inData3);
170 read(INBUF_IDX, horizOffset, vertOffset + 4, inData4);
171 read(INBUF_IDX, horizOffset, vertOffset + 5, inData5);
172 read(INBUF_IDX, horizOffset, vertOffset + 6, inData6);
173 read(INBUF_IDX, horizOffset, vertOffset + 7, inData7);
174
175 outData0 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 0);
176 outData1 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 8);
177 outData2 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 16);
178 outData3 = inData_m.select<SUB_BLOCK_HEIGHT, 1, SUB_BLOCK_PIXEL_WIDTH, 1>(0, 24);
179
180 write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset, vertOffset, outData0);
181 write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH, vertOffset, outData1);
182 write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH*2, vertOffset, outData2);
183 write_plane(OUTBUF_IDX, GENX_SURFACE_Y_PLANE, horizOffset + SUB_BLOCK_PIXEL_WIDTH*3, vertOffset, outData3);
184 }
185