1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
23
24 #include "common.h"
25 #include "primitives.h"
26
27 namespace X265_NS {
28 // x265 private namespace
29
/* Lookup table of luma partition identifiers, laid out as 16 rows by 16
 * columns. Row and column labels are block dimensions in steps of 4 (4..64);
 * going by the LUMA_WxH names stored in each cell, the row label is the first
 * (W) dimension and the column label the second (H) dimension. The value 255
 * fills every cell that has no LUMA_* entry.
 * NOTE(review): the expression callers use to index this table is not visible
 * in this file - confirm it before relying on the row/column order. */
extern const uint8_t lumaPartitionMapTable[] =
{
    // column labels: 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64
    LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4
    LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8
    255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20
    255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28
    255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60
    255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64
};
50
51 /* the "authoritative" set of encoder primitives */
52 EncoderPrimitives primitives;
53
54 void setupPixelPrimitives_c(EncoderPrimitives &p);
55 void setupDCTPrimitives_c(EncoderPrimitives &p);
56 void setupFilterPrimitives_c(EncoderPrimitives &p);
57 void setupIntraPrimitives_c(EncoderPrimitives &p);
58 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
59 void setupSaoPrimitives_c(EncoderPrimitives &p);
60 void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
61 void setupLowPassPrimitives_c(EncoderPrimitives& p);
62
setupCPrimitives(EncoderPrimitives & p)63 void setupCPrimitives(EncoderPrimitives &p)
64 {
65 setupPixelPrimitives_c(p); // pixel.cpp
66 setupDCTPrimitives_c(p); // dct.cpp
67 setupLowPassPrimitives_c(p); // lowpassdct.cpp
68 setupFilterPrimitives_c(p); // ipfilter.cpp
69 setupIntraPrimitives_c(p); // intrapred.cpp
70 setupLoopFilterPrimitives_c(p); // loopfilter.cpp
71 setupSaoPrimitives_c(p); // sao.cpp
72 setupSeaIntegralPrimitives_c(p); // framefilter.cpp
73 }
74
enableLowpassDCTPrimitives(EncoderPrimitives & p)75 void enableLowpassDCTPrimitives(EncoderPrimitives &p)
76 {
77 // update copies of the standard dct transform
78 p.cu[BLOCK_4x4].standard_dct = p.cu[BLOCK_4x4].dct;
79 p.cu[BLOCK_8x8].standard_dct = p.cu[BLOCK_8x8].dct;
80 p.cu[BLOCK_16x16].standard_dct = p.cu[BLOCK_16x16].dct;
81 p.cu[BLOCK_32x32].standard_dct = p.cu[BLOCK_32x32].dct;
82
83 // replace active dct by lowpass dct for high dct transforms
84 p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct;
85 p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct;
86 }
87
setupAliasPrimitives(EncoderPrimitives & p)88 void setupAliasPrimitives(EncoderPrimitives &p)
89 {
90 #if HIGH_BIT_DEPTH
91 /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
92 for (int i = 0; i < NUM_CU_SIZES; i++)
93 {
94 p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
95
96 p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
97 p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
98 p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp;
99
100 p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
101 p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
102 p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
103
104 p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
105 p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
106 p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
107 }
108 #endif
109
110 /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */
111
112 p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;
113
114 for (int i = 0; i < NUM_PU_SIZES; i++)
115 {
116 p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
117 p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED];
118 p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
119 p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
120 p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED];
121 p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
122 }
123
124 for (int i = 0; i < NUM_CU_SIZES; i++)
125 {
126 p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
127 p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
128 p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
129 p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED];
130 p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
131 p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
132 p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
133 p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
134 }
135
136 p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd;
137
138 /* Chroma PU can often use luma satd primitives */
139 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = p.pu[LUMA_4x4].satd;
140 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = p.pu[LUMA_8x8].satd;
141 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd;
142 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd;
143
144 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = p.pu[LUMA_8x4].satd;
145 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = p.pu[LUMA_4x8].satd;
146 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = p.pu[LUMA_16x8].satd;
147 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = p.pu[LUMA_8x16].satd;
148 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd;
149 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd;
150
151 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd;
152 p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd;
153 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = p.pu[LUMA_16x4].satd;
154 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = p.pu[LUMA_4x16].satd;
155 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd;
156 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd;
157 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = p.pu[LUMA_32x8].satd;
158 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = p.pu[LUMA_8x32].satd;
159
160 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = p.pu[LUMA_4x8].satd;
161 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = p.pu[LUMA_8x16].satd;
162 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd;
163 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd;
164
165 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = p.pu[LUMA_4x4].satd;
166 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = p.pu[LUMA_8x8].satd;
167 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = p.pu[LUMA_4x16].satd;
168 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd;
169 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = p.pu[LUMA_8x32].satd;
170 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd;
171 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd;
172
173 //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12] = satd4<8, 12>;
174 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = p.pu[LUMA_8x4].satd;
175 //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>;
176 //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>;
177 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd;
178 //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32] = satd4<4, 32>;
179 //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>;
180 //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>;
181 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd;
182 //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64] = satd8<8, 64>;
183
184 p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL;
185 p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd;
186 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d;
187 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d;
188 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d;
189
190 p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL;
191 p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd;
192
193 /* alias CU copy_pp from square PU copy_pp */
194 for (int i = 0; i < NUM_CU_SIZES; i++)
195 {
196 p.cu[i].copy_pp = p.pu[i].copy_pp;
197
198 for (int c = 0; c < X265_CSP_COUNT; c++)
199 p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp;
200 }
201
202 p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL;
203 p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp;
204 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp;
205 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp;
206 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp;
207
208 p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
209 }
210
x265_report_simd(x265_param * param)211 void x265_report_simd(x265_param* param)
212 {
213 if (param->logLevel >= X265_LOG_INFO)
214 {
215 int cpuid = param->cpuid;
216
217 char buf[1000];
218 char *p = buf + sprintf(buf, "using cpu capabilities:");
219 char *none = p;
220 for (int i = 0; X265_NS::cpu_names[i].flags; i++)
221 {
222 if (!strcmp(X265_NS::cpu_names[i].name, "SSE")
223 && (cpuid & X265_CPU_SSE2))
224 continue;
225 if (!strcmp(X265_NS::cpu_names[i].name, "SSE2")
226 && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
227 continue;
228 if (!strcmp(X265_NS::cpu_names[i].name, "SSE3")
229 && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
230 continue;
231 if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1")
232 && (cpuid & X265_CPU_SSE42))
233 continue;
234 if (!strcmp(X265_NS::cpu_names[i].name, "BMI1")
235 && (cpuid & X265_CPU_BMI2))
236 continue;
237 if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags
238 && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags))
239 p += sprintf(p, " %s", X265_NS::cpu_names[i].name);
240 }
241
242 if (p == none)
243 sprintf(p, " none!");
244 x265_log(param, X265_LOG_INFO, "%s\n", buf);
245 }
246 }
247
x265_setup_primitives(x265_param * param)248 void x265_setup_primitives(x265_param *param)
249 {
250 if (!primitives.pu[0].sad)
251 {
252 setupCPrimitives(primitives);
253
254 /* We do not want the encoder to use the un-optimized intra all-angles
255 * C references. It is better to call the individual angle functions
256 * instead. We must check for NULL before using this primitive */
257 for (int i = 0; i < NUM_TR_SIZE; i++)
258 primitives.cu[i].intra_pred_allangs = NULL;
259
260 #if ENABLE_ASSEMBLY
261 #if X265_ARCH_X86
262 setupInstrinsicPrimitives(primitives, param->cpuid);
263 #endif
264 setupAssemblyPrimitives(primitives, param->cpuid);
265 #endif
266 #if HAVE_ALTIVEC
267 if (param->cpuid & X265_CPU_ALTIVEC)
268 {
269 setupPixelPrimitives_altivec(primitives); // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
270 setupDCTPrimitives_altivec(primitives); // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
271 setupFilterPrimitives_altivec(primitives); // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
272 setupIntraPrimitives_altivec(primitives); // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions
273 }
274 #endif
275
276 setupAliasPrimitives(primitives);
277
278 if (param->bLowPassDct)
279 {
280 enableLowpassDCTPrimitives(primitives);
281 }
282 }
283
284 x265_report_simd(param);
285 }
286 }
287
288 #if ENABLE_ASSEMBLY && X265_ARCH_X86
289 /* these functions are implemented in assembly. When assembly is not being
290 * compiled, they are unnecessary and can be NOPs */
291 #else
292 extern "C" {
PFX(cpu_cpuid_test)293 int PFX(cpu_cpuid_test)(void) { return 0; }
PFX(cpu_emms)294 void PFX(cpu_emms)(void) {}
PFX(cpu_cpuid)295 void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
PFX(cpu_xgetbv)296 void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}
297
298 #if X265_ARCH_ARM == 0
PFX(cpu_neon_test)299 void PFX(cpu_neon_test)(void) {}
PFX(cpu_fast_neon_mrc_test)300 int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
301 #endif // X265_ARCH_ARM
302 }
303 #endif
304