1 /*****************************************************************************
2  * Copyright (C) 2013-2020 MulticoreWare, Inc
3  *
4  * Authors: Steve Borho <steve@borho.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
19  *
20  * This program is also available under a commercial proprietary license.
21  * For more information, contact us at license @ x265.com.
22  *****************************************************************************/
23 
24 #include "common.h"
25 #include "primitives.h"
26 
27 namespace X265_NS {
28 // x265 private namespace
29 
/* Maps a luma (width, height) pair to its LUMA_WxH partition enum value.
 * The table is indexed as ((width >> 2) - 1) * 16 + ((height >> 2) - 1):
 * each row is a width in steps of 4 pixels (4..64), each column a height.
 * Geometries that are not valid HEVC luma partition sizes hold the
 * sentinel 255. */
extern const uint8_t lumaPartitionMapTable[] =
{
//  4          8          12          16          20  24          28  32          36  40  44  48          52  56  60  64
    LUMA_4x4,  LUMA_4x8,  255,        LUMA_4x16,  255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 4
    LUMA_8x4,  LUMA_8x8,  255,        LUMA_8x16,  255, 255,        255, LUMA_8x32,  255, 255, 255, 255,        255, 255, 255, 255,        // 8
    255,        255,      255,        LUMA_12x16, 255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255,        255, LUMA_16x32, 255, 255, 255, 255,        255, 255, 255, LUMA_16x64, // 16
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 20
    255,        255,      255,        255,        255, 255,        255, LUMA_24x32, 255, 255, 255, 255,        255, 255, 255, 255,        // 24
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 28
    255,        LUMA_32x8, 255,       LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255,        255, 255, 255, LUMA_32x64, // 32
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 36
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 40
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 44
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, LUMA_48x64, // 48
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 52
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 56
    255,        255,      255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 60
    255,        255,      255,        LUMA_64x16, 255, 255,        255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64  // 64
};
50 
/* the "authoritative" set of encoder primitives */
EncoderPrimitives primitives;

/* Per-module setup routines for the portable C reference primitives.
 * Each one populates its slice of the function-pointer table; the
 * implementing source file is noted beside each call in
 * setupCPrimitives() below. */
void setupPixelPrimitives_c(EncoderPrimitives &p);
void setupDCTPrimitives_c(EncoderPrimitives &p);
void setupFilterPrimitives_c(EncoderPrimitives &p);
void setupIntraPrimitives_c(EncoderPrimitives &p);
void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
void setupSaoPrimitives_c(EncoderPrimitives &p);
void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
void setupLowPassPrimitives_c(EncoderPrimitives& p);
62 
/* Install the portable C reference implementations of every encoder
 * primitive into 'p'.  This is always done first; optimized
 * (intrinsic/assembly/AltiVec) versions later overwrite individual
 * entries when available — see x265_setup_primitives(). */
void setupCPrimitives(EncoderPrimitives &p)
{
    setupPixelPrimitives_c(p);      // pixel.cpp
    setupDCTPrimitives_c(p);        // dct.cpp
    setupLowPassPrimitives_c(p);    // lowpassdct.cpp
    setupFilterPrimitives_c(p);     // ipfilter.cpp
    setupIntraPrimitives_c(p);      // intrapred.cpp
    setupLoopFilterPrimitives_c(p); // loopfilter.cpp
    setupSaoPrimitives_c(p);        // sao.cpp
    setupSeaIntegralPrimitives_c(p);  // framefilter.cpp
}
74 
enableLowpassDCTPrimitives(EncoderPrimitives & p)75 void enableLowpassDCTPrimitives(EncoderPrimitives &p)
76 {
77     // update copies of the standard dct transform
78     p.cu[BLOCK_4x4].standard_dct = p.cu[BLOCK_4x4].dct;
79     p.cu[BLOCK_8x8].standard_dct = p.cu[BLOCK_8x8].dct;
80     p.cu[BLOCK_16x16].standard_dct = p.cu[BLOCK_16x16].dct;
81     p.cu[BLOCK_32x32].standard_dct = p.cu[BLOCK_32x32].dct;
82 
83     // replace active dct by lowpass dct for high dct transforms
84     p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct;
85     p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct;
86 }
87 
/* Fill in primitive entries that can safely reuse (alias) another
 * primitive's function pointer: chroma 4:4:4 reuses luma, chroma satd
 * reuses same-geometry luma satd, and CU copies reuse square PU copies.
 * Must run AFTER the C/ASM setup so it aliases the final selections;
 * the statement order below is significant. */
void setupAliasPrimitives(EncoderPrimitives &p)
{
#if HIGH_BIT_DEPTH
    /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;

        // pixel and short copies are layout-identical here, so all three
        // mixed-type copies can reuse the pixel-to-pixel copy
        p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
        p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
        p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp;

        p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;

        p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
    }
#endif

    /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */

    // NOTE(review): this NULL is overwritten by the NUM_CU_SIZES loop
    // below, which copies p.cu[BLOCK_4x4].sa8d over it — confirm whether
    // the NULL was meant to be applied after that loop instead
    p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;

    for (int i = 0; i < NUM_PU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
        p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED]  = p.pu[i].addAvg[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
        p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
        p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED]     = p.pu[i].convert_p2s[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
    }

    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].cu[i].sa8d    = p.cu[i].sa8d;
        p.chroma[X265_CSP_I444].cu[i].sse_pp  = p.cu[i].sse_pp;
        p.chroma[X265_CSP_I444].cu[i].sub_ps  = p.cu[i].sub_ps;
        p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED]  = p.cu[i].add_ps[NONALIGNED];
        p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
        p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
        p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
        p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
    }

    // 4x4 is too small for an 8x8 sa8d; use satd instead.  NOTE(review):
    // this runs after the I444 alias loop above, so I444's 4x4 sa8d keeps
    // the earlier p.cu value rather than this satd — verify intended
    p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd;

    /* Chroma PU can often use luma satd primitives */
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = p.pu[LUMA_8x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = p.pu[LUMA_16x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = p.pu[LUMA_16x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = p.pu[LUMA_32x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = p.pu[LUMA_8x32].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = p.pu[LUMA_8x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd;

    // 4:2:2 geometries with no same-size luma partition (8x12, 16x24,
    // 12x32, ...) have no alias; their commented entries are kept for
    // documentation
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12]  = satd4<8, 12>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd  = p.pu[LUMA_8x4].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32]  = satd4<4, 32>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64]  = satd8<8, 64>;

    // blocks too small for sa8d are left NULL; callers must check
    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL;
    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd;

    /* alias CU copy_pp from square PU copy_pp */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.cu[i].copy_pp = p.pu[i].copy_pp;

        for (int c = 0; c < X265_CSP_COUNT; c++)
            p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp;
    }

    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
}
210 
x265_report_simd(x265_param * param)211 void x265_report_simd(x265_param* param)
212 {
213     if (param->logLevel >= X265_LOG_INFO)
214     {
215         int cpuid = param->cpuid;
216 
217         char buf[1000];
218         char *p = buf + sprintf(buf, "using cpu capabilities:");
219         char *none = p;
220         for (int i = 0; X265_NS::cpu_names[i].flags; i++)
221         {
222             if (!strcmp(X265_NS::cpu_names[i].name, "SSE")
223                 && (cpuid & X265_CPU_SSE2))
224                 continue;
225             if (!strcmp(X265_NS::cpu_names[i].name, "SSE2")
226                 && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
227                 continue;
228             if (!strcmp(X265_NS::cpu_names[i].name, "SSE3")
229                 && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
230                 continue;
231             if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1")
232                 && (cpuid & X265_CPU_SSE42))
233                 continue;
234             if (!strcmp(X265_NS::cpu_names[i].name, "BMI1")
235                 && (cpuid & X265_CPU_BMI2))
236                 continue;
237             if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags
238                 && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags))
239                 p += sprintf(p, " %s", X265_NS::cpu_names[i].name);
240         }
241 
242         if (p == none)
243             sprintf(p, " none!");
244         x265_log(param, X265_LOG_INFO, "%s\n", buf);
245     }
246 }
247 
/* One-time initialization of the global 'primitives' table, then report
 * the detected SIMD capabilities.  Order matters: C references first,
 * then platform-optimized overrides, then alias setup, then the optional
 * lowpass-DCT swap.  The sad-pointer check makes repeat calls (e.g. for
 * additional encoder instances) skip re-initialization. */
void x265_setup_primitives(x265_param *param)
{
    if (!primitives.pu[0].sad)
    {
        setupCPrimitives(primitives);

        /* We do not want the encoder to use the un-optimized intra all-angles
         * C references. It is better to call the individual angle functions
         * instead. We must check for NULL before using this primitive */
        for (int i = 0; i < NUM_TR_SIZE; i++)
            primitives.cu[i].intra_pred_allangs = NULL;

#if ENABLE_ASSEMBLY
#if X265_ARCH_X86
        setupInstrinsicPrimitives(primitives, param->cpuid);
#endif
        setupAssemblyPrimitives(primitives, param->cpuid);
#endif
#if HAVE_ALTIVEC
        if (param->cpuid & X265_CPU_ALTIVEC)
        {
            setupPixelPrimitives_altivec(primitives);       // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupDCTPrimitives_altivec(primitives);         // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupFilterPrimitives_altivec(primitives);      // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
            setupIntraPrimitives_altivec(primitives);       // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions
        }
#endif

        // aliasing must come after all overrides so it copies the final
        // function pointers
        setupAliasPrimitives(primitives);

        if (param->bLowPassDct)
        {
            enableLowpassDCTPrimitives(primitives);
        }
    }

    x265_report_simd(param);
}
286 }
287 
#if ENABLE_ASSEMBLY && X265_ARCH_X86
/* these functions are implemented in assembly. When assembly is not being
 * compiled, they are unnecessary and can be NOPs */
#else
extern "C" {
// Stub replacements for the x86 assembly CPU-detection helpers so that
// non-assembly builds still link; they report no capabilities.
int PFX(cpu_cpuid_test)(void) { return 0; }
void PFX(cpu_emms)(void) {}
// zero eax so callers see "no CPUID support"
void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}

#if X265_ARCH_ARM == 0
// ARM-only helpers; stubbed on every other architecture
void PFX(cpu_neon_test)(void) {}
int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
#endif // X265_ARCH_ARM
}
#endif
304