1 /*****************************************************************************
2
3 AvsFilterRemoveGrain/Repair16
4 Author: Laurent de Soras, 2012
5 Modified for VapourSynth by Fredrik Mellbin 2013
6
7 --- Legal stuff ---
8
9 This program is free software. It comes without any warranty, to
10 the extent permitted by applicable law. You can redistribute it
11 and/or modify it under the terms of the Do What The Fuck You Want
12 To Public License, Version 2, as published by Sam Hocevar. See
13 http://sam.zoy.org/wtfpl/COPYING for more details.
14
15 *Tab=3***********************************************************************/
16
17 #include "shared.h"
18
19 #ifdef VS_TARGET_CPU_X86
20
21 class ConvSigned
22 {
23 public:
cv(__m128i a,__m128i m)24 static __forceinline __m128i cv (__m128i a, __m128i m)
25 {
26 return (_mm_xor_si128 (a, m));
27 }
28 };
29
30
31 class ConvUnsigned
32 {
33 public:
cv(__m128i a,__m128i m)34 static __forceinline __m128i cv (__m128i a, __m128i m)
35 {
36 return (a);
37 }
38 };
39
// Declares and loads the pixels every SIMD rg() in this file works on.
//
// Must be expanded in a scope where T, ConvSign, src1_ptr, src2_ptr,
// stride_src2 and mask_sign are visible. It introduces these locals:
//   om, o0, op : stride_src2 - 1, stride_src2, stride_src2 + 1
//   cr         : loaded from src1_ptr
//   a1..a8, c  : loaded from src2_ptr at the offsets
//                  a1 = -op   a2 = -o0   a3 = -om
//                  a4 = -1    c  =  0    a5 = +1
//                  a6 = +om   a7 = +o0   a8 = +op
//                (a 3x3 neighbourhood if stride_src2 is the row pitch in
//                 units of T -- presumably; confirm against the caller)
//
// When sizeof(T) == 1, 8 bytes are loaded and interleaved with zero bytes
// (zero-extension to 16-bit lanes); otherwise a full unaligned 128-bit load
// is used. Every loaded vector is passed through ConvSign::cv with mask_sign.
// Both branches are compiled for every T; only one is taken.
#define AvsFilterRepair16_READ_PIX \
	const int om = stride_src2 - 1; \
	const int o0 = stride_src2 ; \
	const int op = stride_src2 + 1; \
	__m128i cr, a1, a2, a3, a4, c, a5, a6, a7, a8; \
	if (sizeof(T) == 1) { \
		__m128i zeroreg = _mm_setzero_si128(); \
		cr = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src1_ptr + 0)), zeroreg), mask_sign); \
		a1 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - op)), zeroreg), mask_sign); \
		a2 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - o0)), zeroreg), mask_sign); \
		a3 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - om)), zeroreg), mask_sign); \
		a4 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - 1 )), zeroreg), mask_sign); \
		c = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + 0 )), zeroreg), mask_sign); \
		a5 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + 1 )), zeroreg), mask_sign); \
		a6 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + om)), zeroreg), mask_sign); \
		a7 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + o0)), zeroreg), mask_sign); \
		a8 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + op)), zeroreg), mask_sign); \
	} else { \
		cr = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src1_ptr + 0 )), mask_sign); \
		a1 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - op)), mask_sign); \
		a2 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - o0)), mask_sign); \
		a3 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - om)), mask_sign); \
		a4 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - 1 )), mask_sign); \
		c = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + 0 )), mask_sign); \
		a5 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + 1 )), mask_sign); \
		a6 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + om)), mask_sign); \
		a7 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + o0)), mask_sign); \
		a8 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + op)), mask_sign); \
	}
69
// For each opposite pair (a1,a8), (a2,a7), (a3,a6), (a4,a5) declares
// maN / miN as the lane-wise signed 16-bit maximum / minimum of the pair.
// Expects __m128i variables a1..a8 in scope at the expansion site.
#define AvsFilterRepair16_SORT_AXIS_SSE2 \
	const __m128i ma1 = _mm_max_epi16(a1, a8); \
	const __m128i mi1 = _mm_min_epi16(a1, a8); \
	const __m128i ma2 = _mm_max_epi16(a2, a7); \
	const __m128i mi2 = _mm_min_epi16(a2, a7); \
	const __m128i ma3 = _mm_max_epi16(a3, a6); \
	const __m128i mi3 = _mm_min_epi16(a3, a6); \
	const __m128i ma4 = _mm_max_epi16(a4, a5); \
	const __m128i mi4 = _mm_min_epi16(a4, a5);
79
80 #else
81
// Empty stand-in compiled when VS_TARGET_CPU_X86 is not defined, so that the
// `typedef ConvSigned ConvSign;` lines in the operator classes still resolve.
class ConvSigned
{
};
85
86
// Empty stand-in compiled when VS_TARGET_CPU_X86 is not defined, so that the
// `typedef ConvUnsigned ConvSign;` lines in the operator classes still resolve.
class ConvUnsigned
{
};
90 #endif
91
// Scalar counterpart of AvsFilterRepair16_SORT_AXIS_SSE2: for each opposite
// pair (a1,a8), (a2,a7), (a3,a6), (a4,a5) declares maN / miN as the larger /
// smaller value. Expects int-compatible variables a1..a8 in scope.
#define AvsFilterRepair16_SORT_AXIS_CPP \
	const int ma1 = std::max(a1, a8); \
	const int mi1 = std::min(a1, a8); \
	const int ma2 = std::max(a2, a7); \
	const int mi2 = std::min(a2, a7); \
	const int ma3 = std::max(a3, a6); \
	const int mi3 = std::min(a3, a6); \
	const int ma4 = std::max(a4, a5); \
	const int mi4 = std::min(a4, a5);
101
102
103 class OpRG01
104 {
105 public:
106 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)107 static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
108 const int mi = std::min (std::min (
109 std::min (std::min (a1, a2), std::min (a3, a4)),
110 std::min (std::min (a5, a6), std::min (a7, a8))
111 ), c);
112 const int ma = std::max (std::max (
113 std::max (std::max (a1, a2), std::max (a3, a4)),
114 std::max (std::max (a5, a6), std::max (a7, a8))
115 ), c);
116
117 return (limit (cr, mi, ma));
118 }
119 #ifdef VS_TARGET_CPU_X86
120 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)121 static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
122 AvsFilterRepair16_READ_PIX
123
124 const __m128i mi = _mm_min_epi16 (_mm_min_epi16 (
125 _mm_min_epi16 (_mm_min_epi16 (a1, a2), _mm_min_epi16 (a3, a4)),
126 _mm_min_epi16 (_mm_min_epi16 (a5, a6), _mm_min_epi16 (a7, a8))
127 ), c);
128 const __m128i ma = _mm_max_epi16 (_mm_max_epi16 (
129 _mm_max_epi16 (_mm_max_epi16 (a1, a2), _mm_max_epi16 (a3, a4)),
130 _mm_max_epi16 (_mm_max_epi16 (a5, a6), _mm_max_epi16 (a7, a8))
131 ), c);
132
133 return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
134 }
135 #endif
136 };
137
// Operator 2: limits cr between the 2nd smallest and the 2nd largest of the
// nine values a1..a8 and c.
class OpRG02
{
public:
	typedef ConvSigned ConvSign;
	// Scalar version: sorts the nine values ascending and limits cr to
	// [a[1], a[7]].
	static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
		int a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };

		std::sort (&a [0], (&a [8]) + 1);

		return (limit (cr, a [2-1], a [7]));
	}
#ifdef VS_TARGET_CPU_X86
	// SIMD version: a fixed sequence of compare-exchange steps on the nine
	// vectors loaded by AvsFilterRepair16_READ_PIX. sort_pair is defined
	// outside this chunk; judging by the trailing comments below it
	// presumably leaves the minimum in its first argument and the maximum in
	// its second. Where only one output of a step is needed later, the step
	// is reduced to a single min or max (original step kept as a comment).
	// NOTE(review): statement order matters; a2 / a7 are expected to end up
	// as the lower / upper bounds matching the scalar version (not
	// re-derived here).
	template<typename T>
	static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
		AvsFilterRepair16_READ_PIX

		sort_pair (a1, a8);

		sort_pair (a1, c);
		sort_pair (a2, a5);
		sort_pair (a3, a6);
		sort_pair (a4, a7);
		sort_pair ( c, a8);

		sort_pair (a1, a3);
		sort_pair ( c, a6);
		sort_pair (a2, a4);
		sort_pair (a5, a7);

		sort_pair (a3, a8);

		sort_pair (a3, c);
		sort_pair (a6, a8);
		sort_pair (a4, a5);

		a2 = _mm_max_epi16 (a1, a2); // sort_pair (a1, a2);
		a3 = _mm_min_epi16 (a3, a4); // sort_pair (a3, a4);
		sort_pair ( c, a5);
		a7 = _mm_max_epi16 (a6, a7); // sort_pair (a6, a7);

		sort_pair (a2, a8);

		a2 = _mm_min_epi16 (a2, c); // sort_pair (a2, c);
		a8 = _mm_max_epi16 (a5, a8); // sort_pair (a5, a8);

		a2 = _mm_min_epi16 (a2, a3); // sort_pair (a2, a3);
		a7 = _mm_min_epi16 (a7, a8); // sort_pair (a7, a8);

		// Clamp cr to [a2, a7] lane-wise.
		return (_mm_min_epi16 (_mm_max_epi16 (cr, a2), a7));
	}
#endif
};
190
// Operator 3: limits cr between the 3rd smallest and the 3rd largest of the
// nine values a1..a8 and c.
class OpRG03
{
public:
	typedef ConvSigned ConvSign;
	// Scalar version: sorts the nine values ascending and limits cr to
	// [a[2], a[6]].
	static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
		int a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };

		std::sort (&a [0], (&a [8]) + 1);

		return (limit (cr, a [3-1], a [6]));
	}
#ifdef VS_TARGET_CPU_X86
	// SIMD version: a fixed sequence of compare-exchange steps on the nine
	// vectors loaded by AvsFilterRepair16_READ_PIX. sort_pair is defined
	// outside this chunk (presumably min into the first argument, max into
	// the second). Steps with only one live output are reduced to a single
	// min or max, with the original step kept as a trailing comment.
	// NOTE(review): statement order matters; a3 / a6 are expected to end up
	// as the bounds matching the scalar version (not re-derived here).
	template<typename T>
	static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
		AvsFilterRepair16_READ_PIX

		sort_pair (a1, a8);

		sort_pair (a1, c);
		sort_pair (a2, a5);
		sort_pair (a3, a6);
		sort_pair (a4, a7);
		sort_pair ( c, a8);

		sort_pair (a1, a3);
		sort_pair ( c, a6);
		sort_pair (a2, a4);
		sort_pair (a5, a7);

		sort_pair (a3, a8);

		sort_pair (a3, c);
		sort_pair (a6, a8);
		sort_pair (a4, a5);

		a2 = _mm_max_epi16 (a1, a2); // sort_pair (a1, a2);
		sort_pair (a3, a4);
		sort_pair ( c, a5);
		a6 = _mm_min_epi16 (a6, a7); // sort_pair (a6, a7);

		sort_pair (a2, a8);

		a2 = _mm_min_epi16 (a2, c); // sort_pair (a2, c);
		a6 = _mm_max_epi16 (a4, a6); // sort_pair (a4, a6);
		a5 = _mm_min_epi16 (a5, a8); // sort_pair (a5, a8);

		a3 = _mm_max_epi16 (a2, a3); // sort_pair (a2, a3);
		a6 = _mm_max_epi16 (a5, a6); // sort_pair (a5, a6);

		// Clamp cr to [a3, a6] lane-wise.
		return (_mm_min_epi16 (_mm_max_epi16 (cr, a3), a6));
	}
#endif
};
244
// Operator 4: limits cr between the 4th smallest and the 4th largest of the
// nine values a1..a8 and c.
class OpRG04
{
public:
	typedef ConvSigned ConvSign;
	// Scalar version: sorts the nine values ascending and limits cr to
	// [a[3], a[5]].
	static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
		int a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };

		std::sort (&a [0], (&a [8]) + 1);

		return (limit (cr, a [4-1], a [5]));
	}
#ifdef VS_TARGET_CPU_X86
	// SIMD version: a fixed sequence of compare-exchange steps on the nine
	// vectors loaded by AvsFilterRepair16_READ_PIX (see the URL below for the
	// 9-input network it refers to). sort_pair is defined outside this chunk
	// (presumably min into the first argument, max into the second). Steps
	// with only one live output are reduced to a single min or max, with the
	// original step kept as a trailing comment.
	// NOTE(review): statement order matters; a4 / a5 are expected to end up
	// as the bounds matching the scalar version (not re-derived here).
	template<typename T>
	static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
		// http://jgamble.ripco.net/cgi-bin/nw.cgi?inputs=9&algorithm=batcher&output=text

		AvsFilterRepair16_READ_PIX

		sort_pair (a1, a8);

		sort_pair (a1, c);
		sort_pair (a2, a5);
		sort_pair (a3, a6);
		sort_pair (a4, a7);
		sort_pair ( c, a8);

		sort_pair (a1, a3);
		sort_pair ( c, a6);
		sort_pair (a2, a4);
		sort_pair (a5, a7);

		sort_pair (a3, a8);

		sort_pair (a3, c);
		sort_pair (a6, a8);
		sort_pair (a4, a5);

		a2 = _mm_max_epi16 (a1, a2); // sort_pair (a1, a2);
		a4 = _mm_max_epi16 (a3, a4); // sort_pair (a3, a4);
		sort_pair ( c, a5);
		a6 = _mm_min_epi16 (a6, a7); // sort_pair (a6, a7);

		sort_pair (a2, a8);

		c = _mm_max_epi16 (a2, c); // sort_pair (a2, c);
		sort_pair (a4, a6);
		a5 = _mm_min_epi16 (a5, a8); // sort_pair (a5, a8);

		a4 = _mm_min_epi16 (a4, c); // sort_pair (a4, c);
		a5 = _mm_min_epi16 (a5, a6); // sort_pair (a5, a6);

		// Clamp cr to [a4, a5] lane-wise.
		return (_mm_min_epi16 (_mm_max_epi16 (cr, a4), a5));
	}
#endif
};
300
301 class OpRG05
302 {
303 public:
304 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)305 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
306 const int mal1 = std::max(std::max(a1, a8), c);
307 const int mil1 = std::min(std::min(a1, a8), c);
308
309 const int mal2 = std::max(std::max(a2, a7), c);
310 const int mil2 = std::min(std::min(a2, a7), c);
311
312 const int mal3 = std::max(std::max(a3, a6), c);
313 const int mil3 = std::min(std::min(a3, a6), c);
314
315 const int mal4 = std::max(std::max(a4, a5), c);
316 const int mil4 = std::min(std::min(a4, a5), c);
317
318 const int clipped1 = limit(cr, mil1, mal1);
319 const int clipped2 = limit(cr, mil2, mal2);
320 const int clipped3 = limit(cr, mil3, mal3);
321 const int clipped4 = limit(cr, mil4, mal4);
322
323 const int c1 = std::abs(cr - clipped1);
324 const int c2 = std::abs(cr - clipped2);
325 const int c3 = std::abs(cr - clipped3);
326 const int c4 = std::abs(cr - clipped4);
327
328 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
329
330 if (mindiff == c4)
331 return clipped4;
332 else if (mindiff == c2)
333 return clipped2;
334 else if (mindiff == c3)
335 return clipped3;
336 else
337 return clipped1;
338 }
339 #ifdef VS_TARGET_CPU_X86
340 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)341 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
342 AvsFilterRepair16_READ_PIX
343
344 const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
345 const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
346
347 const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
348 const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
349
350 const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
351 const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
352
353 const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
354 const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
355
356 const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
357 const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
358 const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
359 const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
360
361 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
362 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
363 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
364 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
365 const __m128i cru = _mm_xor_si128(cr, mask_sign);
366
367 const __m128i c1u = abs_dif_epu16(cru, clipped1u);
368 const __m128i c2u = abs_dif_epu16(cru, clipped2u);
369 const __m128i c3u = abs_dif_epu16(cru, clipped3u);
370 const __m128i c4u = abs_dif_epu16(cru, clipped4u);
371
372 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
373 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
374 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
375 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
376
377 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
378
379 __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
380 result = select_16_equ(mindiff, c3, clipped3, result);
381 result = select_16_equ(mindiff, c2, clipped2, result);
382 return select_16_equ(mindiff, c4, clipped4, result);
383 }
384 #endif
385 };
386
387 class OpRG06
388 {
389 public:
390 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)391 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
392 const int mal1 = std::max(std::max(a1, a8), c);
393 const int mil1 = std::min(std::min(a1, a8), c);
394
395 const int mal2 = std::max(std::max(a2, a7), c);
396 const int mil2 = std::min(std::min(a2, a7), c);
397
398 const int mal3 = std::max(std::max(a3, a6), c);
399 const int mil3 = std::min(std::min(a3, a6), c);
400
401 const int mal4 = std::max(std::max(a4, a5), c);
402 const int mil4 = std::min(std::min(a4, a5), c);
403
404 const int d1 = mal1 - mil1;
405 const int d2 = mal2 - mil2;
406 const int d3 = mal3 - mil3;
407 const int d4 = mal4 - mil4;
408
409 const int clipped1 = limit(cr, mil1, mal1);
410 const int clipped2 = limit(cr, mil2, mal2);
411 const int clipped3 = limit(cr, mil3, mal3);
412 const int clipped4 = limit(cr, mil4, mal4);
413
414 const int c1 = limit((std::abs(cr - clipped1) << 1) + d1, 0, 0xFFFF);
415 const int c2 = limit((std::abs(cr - clipped2) << 1) + d2, 0, 0xFFFF);
416 const int c3 = limit((std::abs(cr - clipped3) << 1) + d3, 0, 0xFFFF);
417 const int c4 = limit((std::abs(cr - clipped4) << 1) + d4, 0, 0xFFFF);
418
419 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
420
421 if (mindiff == c4)
422 return clipped4;
423 else if (mindiff == c2)
424 return clipped2;
425 else if (mindiff == c3)
426 return clipped3;
427 else
428 return clipped1;
429 }
430 #ifdef VS_TARGET_CPU_X86
431 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)432 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
433 AvsFilterRepair16_READ_PIX
434
435 const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
436 const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
437
438 const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
439 const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
440
441 const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
442 const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
443
444 const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
445 const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
446
447 const __m128i d1 = _mm_sub_epi16(mal1, mil1);
448 const __m128i d2 = _mm_sub_epi16(mal2, mil2);
449 const __m128i d3 = _mm_sub_epi16(mal3, mil3);
450 const __m128i d4 = _mm_sub_epi16(mal4, mil4);
451
452 const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
453 const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
454 const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
455 const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
456
457 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
458 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
459 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
460 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
461 const __m128i cru = _mm_xor_si128(cr, mask_sign);
462
463 const __m128i absdiff1 = abs_dif_epu16(cru, clipped1u);
464 const __m128i absdiff2 = abs_dif_epu16(cru, clipped2u);
465 const __m128i absdiff3 = abs_dif_epu16(cru, clipped3u);
466 const __m128i absdiff4 = abs_dif_epu16(cru, clipped4u);
467
468 const __m128i c1u = _mm_adds_epu16(_mm_adds_epu16(absdiff1, absdiff1), d1);
469 const __m128i c2u = _mm_adds_epu16(_mm_adds_epu16(absdiff2, absdiff2), d2);
470 const __m128i c3u = _mm_adds_epu16(_mm_adds_epu16(absdiff3, absdiff3), d3);
471 const __m128i c4u = _mm_adds_epu16(_mm_adds_epu16(absdiff4, absdiff4), d4);
472
473 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
474 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
475 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
476 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
477
478 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
479
480 __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
481 result = select_16_equ(mindiff, c3, clipped3, result);
482 result = select_16_equ(mindiff, c2, clipped2, result);
483 return select_16_equ(mindiff, c4, clipped4, result);
484 }
485 #endif
486 };
487
488 class OpRG07
489 {
490 public:
491 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)492 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
493 const int mal1 = std::max(std::max(a1, a8), c);
494 const int mil1 = std::min(std::min(a1, a8), c);
495
496 const int mal2 = std::max(std::max(a2, a7), c);
497 const int mil2 = std::min(std::min(a2, a7), c);
498
499 const int mal3 = std::max(std::max(a3, a6), c);
500 const int mil3 = std::min(std::min(a3, a6), c);
501
502 const int mal4 = std::max(std::max(a4, a5), c);
503 const int mil4 = std::min(std::min(a4, a5), c);
504
505 const int d1 = mal1 - mil1;
506 const int d2 = mal2 - mil2;
507 const int d3 = mal3 - mil3;
508 const int d4 = mal4 - mil4;
509
510 const int clipped1 = limit(cr, mil1, mal1);
511 const int clipped2 = limit(cr, mil2, mal2);
512 const int clipped3 = limit(cr, mil3, mal3);
513 const int clipped4 = limit(cr, mil4, mal4);
514
515 const int c1 = std::abs(cr - clipped1) + d1;
516 const int c2 = std::abs(cr - clipped2) + d2;
517 const int c3 = std::abs(cr - clipped3) + d3;
518 const int c4 = std::abs(cr - clipped4) + d4;
519
520 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
521
522 if (mindiff == c4)
523 return clipped4;
524 else if (mindiff == c2)
525 return clipped2;
526 else if (mindiff == c3)
527 return clipped3;
528 else
529 return clipped1;
530 }
531 #ifdef VS_TARGET_CPU_X86
532 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)533 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
534 AvsFilterRepair16_READ_PIX
535
536 const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
537 const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
538
539 const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
540 const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
541
542 const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
543 const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
544
545 const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
546 const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
547
548 const __m128i d1 = _mm_sub_epi16(mal1, mil1);
549 const __m128i d2 = _mm_sub_epi16(mal2, mil2);
550 const __m128i d3 = _mm_sub_epi16(mal3, mil3);
551 const __m128i d4 = _mm_sub_epi16(mal4, mil4);
552
553 const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
554 const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
555 const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
556 const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
557
558 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
559 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
560 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
561 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
562 const __m128i cru = _mm_xor_si128(cr, mask_sign);
563
564 //todo: what happens when this overflows?
565 const __m128i c1u = _mm_adds_epu16(abs_dif_epu16(cru, clipped1u), d1);
566 const __m128i c2u = _mm_adds_epu16(abs_dif_epu16(cru, clipped2u), d2);
567 const __m128i c3u = _mm_adds_epu16(abs_dif_epu16(cru, clipped3u), d3);
568 const __m128i c4u = _mm_adds_epu16(abs_dif_epu16(cru, clipped4u), d4);
569
570 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
571 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
572 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
573 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
574
575 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
576
577 __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
578 result = select_16_equ(mindiff, c3, clipped3, result);
579 result = select_16_equ(mindiff, c2, clipped2, result);
580 return select_16_equ(mindiff, c4, clipped4, result);
581 }
582 #endif
583 };
584
585 class OpRG08
586 {
587 public:
588 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)589 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
590 const int mal1 = std::max(std::max(a1, a8), c);
591 const int mil1 = std::min(std::min(a1, a8), c);
592
593 const int mal2 = std::max(std::max(a2, a7), c);
594 const int mil2 = std::min(std::min(a2, a7), c);
595
596 const int mal3 = std::max(std::max(a3, a6), c);
597 const int mil3 = std::min(std::min(a3, a6), c);
598
599 const int mal4 = std::max(std::max(a4, a5), c);
600 const int mil4 = std::min(std::min(a4, a5), c);
601
602 const int d1 = mal1 - mil1;
603 const int d2 = mal2 - mil2;
604 const int d3 = mal3 - mil3;
605 const int d4 = mal4 - mil4;
606
607 const int clipped1 = limit(cr, mil1, mal1);
608 const int clipped2 = limit(cr, mil2, mal2);
609 const int clipped3 = limit(cr, mil3, mal3);
610 const int clipped4 = limit(cr, mil4, mal4);
611
612 const int c1 = limit(std::abs(cr - clipped1) + (d1 << 1), 0, 0xFFFF);
613 const int c2 = limit(std::abs(cr - clipped2) + (d2 << 1), 0, 0xFFFF);
614 const int c3 = limit(std::abs(cr - clipped3) + (d3 << 1), 0, 0xFFFF);
615 const int c4 = limit(std::abs(cr - clipped4) + (d4 << 1), 0, 0xFFFF);
616
617 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
618
619 if (mindiff == c4)
620 return clipped4;
621 else if (mindiff == c2)
622 return clipped2;
623 else if (mindiff == c3)
624 return clipped3;
625 else
626 return clipped1;
627 }
628 #ifdef VS_TARGET_CPU_X86
629 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)630 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
631 AvsFilterRepair16_READ_PIX
632
633 const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
634 const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
635
636 const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
637 const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
638
639 const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
640 const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
641
642 const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
643 const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
644
645 const __m128i d1 = _mm_sub_epi16(mal1, mil1);
646 const __m128i d2 = _mm_sub_epi16(mal2, mil2);
647 const __m128i d3 = _mm_sub_epi16(mal3, mil3);
648 const __m128i d4 = _mm_sub_epi16(mal4, mil4);
649
650 const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
651 const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
652 const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
653 const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
654
655 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
656 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
657 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
658 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
659 const __m128i cru = _mm_xor_si128(cr, mask_sign);
660
661 const __m128i c1u = _mm_adds_epu16(abs_dif_epu16(cru, clipped1u), _mm_adds_epu16(d1, d1));
662 const __m128i c2u = _mm_adds_epu16(abs_dif_epu16(cru, clipped2u), _mm_adds_epu16(d2, d2));
663 const __m128i c3u = _mm_adds_epu16(abs_dif_epu16(cru, clipped3u), _mm_adds_epu16(d3, d3));
664 const __m128i c4u = _mm_adds_epu16(abs_dif_epu16(cru, clipped4u), _mm_adds_epu16(d4, d4));
665
666 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
667 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
668 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
669 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
670
671 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
672
673 __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
674 result = select_16_equ(mindiff, c3, clipped3, result);
675 result = select_16_equ(mindiff, c2, clipped2, result);
676 return select_16_equ(mindiff, c4, clipped4, result);
677 }
678 #endif
679 };
680
681 class OpRG09
682 {
683 public:
684 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)685 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
686 const int mal1 = std::max(std::max(a1, a8), c);
687 const int mil1 = std::min(std::min(a1, a8), c);
688
689 const int mal2 = std::max(std::max(a2, a7), c);
690 const int mil2 = std::min(std::min(a2, a7), c);
691
692 const int mal3 = std::max(std::max(a3, a6), c);
693 const int mil3 = std::min(std::min(a3, a6), c);
694
695 const int mal4 = std::max(std::max(a4, a5), c);
696 const int mil4 = std::min(std::min(a4, a5), c);
697
698 const int d1 = mal1 - mil1;
699 const int d2 = mal2 - mil2;
700 const int d3 = mal3 - mil3;
701 const int d4 = mal4 - mil4;
702
703 const int mindiff = std::min(std::min(d1, d2), std::min(d3, d4));
704
705 if (mindiff == d4)
706 return limit(cr, mil4, mal4);
707 else if (mindiff == d2)
708 return limit(cr, mil2, mal2);
709 else if (mindiff == d3)
710 return limit(cr, mil3, mal3);
711 else
712 return limit(cr, mil1, mal1);
713 }
714 #ifdef VS_TARGET_CPU_X86
715 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)716 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
717 AvsFilterRepair16_READ_PIX
718
719 const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
720 const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
721
722 const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
723 const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
724
725 const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
726 const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
727
728 const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
729 const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
730
731 const __m128i d1 = _mm_sub_epi16(mal1, mil1);
732 const __m128i d2 = _mm_sub_epi16(mal2, mil2);
733 const __m128i d3 = _mm_sub_epi16(mal3, mil3);
734 const __m128i d4 = _mm_sub_epi16(mal4, mil4);
735
736 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4));
737
738 __m128i result = select_16_equ(mindiff, d1, limit_epi16(cr, mil1, mal1), cr);
739 result = select_16_equ(mindiff, d3, limit_epi16(cr, mil3, mal3), result);
740 result = select_16_equ(mindiff, d2, limit_epi16(cr, mil2, mal2), result);
741 return select_16_equ(mindiff, d4, limit_epi16(cr, mil4, mal4), result);
742 }
743 #endif
744 };
745
746 class OpRG10
747 {
748 public:
749 typedef ConvUnsigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)750 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
751 const int d1 = std::abs(cr - a1);
752 const int d2 = std::abs(cr - a2);
753 const int d3 = std::abs(cr - a3);
754 const int d4 = std::abs(cr - a4);
755 const int d5 = std::abs(cr - a5);
756 const int d6 = std::abs(cr - a6);
757 const int d7 = std::abs(cr - a7);
758 const int d8 = std::abs(cr - a8);
759 const int dc = std::abs(cr - c);
760
761 const int mindiff = std::min(std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8))), dc);
762
763 if (mindiff == d7)
764 return a7;
765 else if (mindiff == d8)
766 return a8;
767 else if (mindiff == d6)
768 return a6;
769 else if (mindiff == d2)
770 return a2;
771 else if (mindiff == d3)
772 return a3;
773 else if (mindiff == d1)
774 return a1;
775 else if (mindiff == d5)
776 return a5;
777 else if (mindiff == dc)
778 return c;
779 else
780 return a4;
781 }
782 #ifdef VS_TARGET_CPU_X86
783 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)784 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
785 AvsFilterRepair16_READ_PIX
786
787 const __m128i d1u = abs_dif_epu16(cr, a1);
788 const __m128i d2u = abs_dif_epu16(cr, a2);
789 const __m128i d3u = abs_dif_epu16(cr, a3);
790 const __m128i d4u = abs_dif_epu16(cr, a4);
791 const __m128i d5u = abs_dif_epu16(cr, a5);
792 const __m128i d6u = abs_dif_epu16(cr, a6);
793 const __m128i d7u = abs_dif_epu16(cr, a7);
794 const __m128i d8u = abs_dif_epu16(cr, a8);
795 const __m128i dcu = abs_dif_epu16(cr, c);
796
797 const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
798 const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
799 const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
800 const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
801 const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
802 const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
803 const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
804 const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
805 const __m128i dc = _mm_xor_si128(dcu, mask_sign);
806
807 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8))), dc);
808
809 __m128i result = select_16_equ(mindiff, d4, a4, c);
810 result = select_16_equ(mindiff, dc, c, result);
811 result = select_16_equ(mindiff, d5, a5, result);
812 result = select_16_equ(mindiff, d1, a1, result);
813 result = select_16_equ(mindiff, d3, a3, result);
814 result = select_16_equ(mindiff, d2, a2, result);
815 result = select_16_equ(mindiff, d6, a6, result);
816 result = select_16_equ(mindiff, d8, a8, result);
817 return select_16_equ(mindiff, d7, a7, result);
818 }
819 #endif
820 };
821
822 class OpRG12
823 {
824 public:
825 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)826 static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
827 int a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
828
829 std::sort (&a [0], (&a [0]) + 8);
830 const int mi = std::min (a [2-1], c);
831 const int ma = std::max (a [7-1], c);
832
833 return (limit (cr, mi, ma));
834 }
835 #ifdef VS_TARGET_CPU_X86
836 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)837 static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
838 AvsFilterRepair16_READ_PIX
839
840 sort_pair (a1, a2);
841 sort_pair (a3, a4);
842 sort_pair (a5, a6);
843 sort_pair (a7, a8);
844
845 sort_pair (a1, a3);
846 sort_pair (a2, a4);
847 sort_pair (a5, a7);
848 sort_pair (a6, a8);
849
850 sort_pair (a2, a3);
851 sort_pair (a6, a7);
852
853 a5 = _mm_max_epi16 (a1, a5); // sort_pair (a1, a5);
854 sort_pair (a2, a6);
855 sort_pair (a3, a7);
856 a4 = _mm_min_epi16 (a4, a8); // sort_pair (a4, a8);
857
858 a3 = _mm_min_epi16 (a3, a5); // sort_pair (a3, a5);
859 a6 = _mm_max_epi16 (a4, a6); // sort_pair (a4, a6);
860
861 a2 = _mm_min_epi16 (a2, a3); // sort_pair (a2, a3);
862 a7 = _mm_max_epi16 (a6, a7); // sort_pair (a6, a7);
863
864 const __m128i mi = _mm_min_epi16 (c, a2);
865 const __m128i ma = _mm_max_epi16 (c, a7);
866
867 return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
868 }
869 #endif
870 };
871
872 class OpRG13
873 {
874 public:
875 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)876 static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
877 int a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
878
879 std::sort (&a [0], (&a [0]) + 8);
880 const int mi = std::min (a [3-1], c);
881 const int ma = std::max (a [6-1], c);
882
883 return (limit (cr, mi, ma));
884 }
885 #ifdef VS_TARGET_CPU_X86
886 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)887 static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
888 AvsFilterRepair16_READ_PIX
889
890 sort_pair (a1, a2);
891 sort_pair (a3, a4);
892 sort_pair (a5, a6);
893 sort_pair (a7, a8);
894
895 sort_pair (a1, a3);
896 sort_pair (a2, a4);
897 sort_pair (a5, a7);
898 sort_pair (a6, a8);
899
900 sort_pair (a2, a3);
901 sort_pair (a6, a7);
902
903 a5 = _mm_max_epi16 (a1, a5); // sort_pair (a1, a5);
904 sort_pair (a2, a6);
905 sort_pair (a3, a7);
906 a4 = _mm_min_epi16 (a4, a8); // sort_pair (a4, a8);
907
908 a3 = _mm_min_epi16 (a3, a5); // sort_pair (a3, a5);
909 a6 = _mm_max_epi16 (a4, a6); // sort_pair (a4, a6);
910
911 a3 = _mm_max_epi16 (a2, a3); // sort_pair (a2, a3);
912 a6 = _mm_min_epi16 (a6, a7); // sort_pair (a6, a7);
913
914 const __m128i mi = _mm_min_epi16 (c, a3);
915 const __m128i ma = _mm_max_epi16 (c, a6);
916
917 return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
918 }
919 #endif
920 };
921
922 class OpRG14
923 {
924 public:
925 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)926 static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
927 int a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
928
929 std::sort (&a [0], (&a [0]) + 8);
930 const int mi = std::min (a [4-1], c);
931 const int ma = std::max (a [5-1], c);
932
933 return (limit (cr, mi, ma));
934 }
935 #ifdef VS_TARGET_CPU_X86
936 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)937 static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
938 AvsFilterRepair16_READ_PIX
939
940 sort_pair (a1, a2);
941 sort_pair (a3, a4);
942 sort_pair (a5, a6);
943 sort_pair (a7, a8);
944
945 sort_pair (a1, a3);
946 sort_pair (a2, a4);
947 sort_pair (a5, a7);
948 sort_pair (a6, a8);
949
950 sort_pair (a2, a3);
951 sort_pair (a6, a7);
952
953 a5 = _mm_max_epi16 (a1, a5); // sort_pair (a1, a5);
954 a6 = _mm_max_epi16 (a2, a6); // sort_pair (a2, a6);
955 a3 = _mm_min_epi16 (a3, a7); // sort_pair (a3, a7);
956 a4 = _mm_min_epi16 (a4, a8); // sort_pair (a4, a8);
957
958 a5 = _mm_max_epi16 (a3, a5); // sort_pair (a3, a5);
959 a4 = _mm_min_epi16 (a4, a6); // sort_pair (a4, a6);
960
961 sort_pair (a4, a5);
962
963 const __m128i mi = _mm_min_epi16 (c, a4);
964 const __m128i ma = _mm_max_epi16 (c, a5);
965
966 return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
967 }
968 #endif
969 };
970
971 class OpRG15 {
972 public:
973 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)974 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
975 AvsFilterRepair16_SORT_AXIS_CPP
976
977 const int c1 = std::abs(c - limit(c, mi1, ma1));
978 const int c2 = std::abs(c - limit(c, mi2, ma2));
979 const int c3 = std::abs(c - limit(c, mi3, ma3));
980 const int c4 = std::abs(c - limit(c, mi4, ma4));
981
982 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
983
984 int mi;
985 int ma;
986 if (mindiff == c4) {
987 mi = mi4;
988 ma = ma4;
989 } else if (mindiff == c2) {
990 mi = mi2;
991 ma = ma2;
992 } else if (mindiff == c3) {
993 mi = mi3;
994 ma = ma3;
995 } else {
996 mi = mi1;
997 ma = ma1;
998 }
999
1000 mi = std::min(mi, c);
1001 ma = std::max(ma, c);
1002
1003 return (limit(cr, mi, ma));
1004 }
1005 #ifdef VS_TARGET_CPU_X86
1006 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1007 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1008 AvsFilterRepair16_READ_PIX
1009 AvsFilterRepair16_SORT_AXIS_SSE2
1010
1011 const __m128i cma1 = _mm_max_epi16(c, ma1);
1012 const __m128i cma2 = _mm_max_epi16(c, ma2);
1013 const __m128i cma3 = _mm_max_epi16(c, ma3);
1014 const __m128i cma4 = _mm_max_epi16(c, ma4);
1015
1016 const __m128i cmi1 = _mm_min_epi16(c, mi1);
1017 const __m128i cmi2 = _mm_min_epi16(c, mi2);
1018 const __m128i cmi3 = _mm_min_epi16(c, mi3);
1019 const __m128i cmi4 = _mm_min_epi16(c, mi4);
1020
1021 const __m128i clipped1 = limit_epi16(c, mi1, ma1);
1022 const __m128i clipped2 = limit_epi16(c, mi2, ma2);
1023 const __m128i clipped3 = limit_epi16(c, mi3, ma3);
1024 const __m128i clipped4 = limit_epi16(c, mi4, ma4);
1025
1026 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
1027 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
1028 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
1029 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
1030 const __m128i cu = _mm_xor_si128(c, mask_sign);
1031
1032 const __m128i c1u = abs_dif_epu16(cu, clipped1u);
1033 const __m128i c2u = abs_dif_epu16(cu, clipped2u);
1034 const __m128i c3u = abs_dif_epu16(cu, clipped3u);
1035 const __m128i c4u = abs_dif_epu16(cu, clipped4u);
1036
1037 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
1038 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
1039 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
1040 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
1041
1042 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
1043
1044 __m128i result = select_16_equ(mindiff, c1, limit_epi16(cr, cmi1, cma1), cr);
1045 result = select_16_equ(mindiff, c3, limit_epi16(cr, cmi3, cma3), result);
1046 result = select_16_equ(mindiff, c2, limit_epi16(cr, cmi2, cma2), result);
1047 return select_16_equ(mindiff, c4, limit_epi16(cr, cmi4, cma4), result);
1048 }
1049 #endif
1050 };
1051
1052 class OpRG16 {
1053 public:
1054 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1055 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1056 AvsFilterRepair16_SORT_AXIS_CPP
1057
1058 const int d1 = ma1 - mi1;
1059 const int d2 = ma2 - mi2;
1060 const int d3 = ma3 - mi3;
1061 const int d4 = ma4 - mi4;
1062
1063 const int c1 = limit((std::abs(c - limit(c, mi1, ma1)) << 1) + d1, 0, 0xFFFF);
1064 const int c2 = limit((std::abs(c - limit(c, mi2, ma2)) << 1) + d2, 0, 0xFFFF);
1065 const int c3 = limit((std::abs(c - limit(c, mi3, ma3)) << 1) + d3, 0, 0xFFFF);
1066 const int c4 = limit((std::abs(c - limit(c, mi4, ma4)) << 1) + d4, 0, 0xFFFF);
1067
1068 const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
1069
1070 int mi;
1071 int ma;
1072 if (mindiff == c4) {
1073 mi = mi4;
1074 ma = ma4;
1075 } else if (mindiff == c2) {
1076 mi = mi2;
1077 ma = ma2;
1078 } else if (mindiff == c3) {
1079 mi = mi3;
1080 ma = ma3;
1081 } else {
1082 mi = mi1;
1083 ma = ma1;
1084 }
1085
1086 mi = std::min(mi, c);
1087 ma = std::max(ma, c);
1088
1089 return (limit(cr, mi, ma));
1090 }
1091 #ifdef VS_TARGET_CPU_X86
1092 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1093 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1094 AvsFilterRepair16_READ_PIX
1095 AvsFilterRepair16_SORT_AXIS_SSE2
1096
1097 const __m128i cma1 = _mm_max_epi16(c, ma1);
1098 const __m128i cma2 = _mm_max_epi16(c, ma2);
1099 const __m128i cma3 = _mm_max_epi16(c, ma3);
1100 const __m128i cma4 = _mm_max_epi16(c, ma4);
1101
1102 const __m128i cmi1 = _mm_min_epi16(c, mi1);
1103 const __m128i cmi2 = _mm_min_epi16(c, mi2);
1104 const __m128i cmi3 = _mm_min_epi16(c, mi3);
1105 const __m128i cmi4 = _mm_min_epi16(c, mi4);
1106
1107 const __m128i d1 = _mm_sub_epi16(ma1, mi1);
1108 const __m128i d2 = _mm_sub_epi16(ma2, mi2);
1109 const __m128i d3 = _mm_sub_epi16(ma3, mi3);
1110 const __m128i d4 = _mm_sub_epi16(ma4, mi4);
1111
1112 const __m128i clipped1 = limit_epi16(c, mi1, ma1);
1113 const __m128i clipped2 = limit_epi16(c, mi2, ma2);
1114 const __m128i clipped3 = limit_epi16(c, mi3, ma3);
1115 const __m128i clipped4 = limit_epi16(c, mi4, ma4);
1116
1117 const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
1118 const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
1119 const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
1120 const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
1121 const __m128i cu = _mm_xor_si128(c, mask_sign);
1122
1123 const __m128i absdiff1 = abs_dif_epu16(cu, clipped1u);
1124 const __m128i absdiff2 = abs_dif_epu16(cu, clipped2u);
1125 const __m128i absdiff3 = abs_dif_epu16(cu, clipped3u);
1126 const __m128i absdiff4 = abs_dif_epu16(cu, clipped4u);
1127
1128 const __m128i c1u = _mm_adds_epu16(_mm_adds_epu16(absdiff1, absdiff1), d1);
1129 const __m128i c2u = _mm_adds_epu16(_mm_adds_epu16(absdiff2, absdiff2), d2);
1130 const __m128i c3u = _mm_adds_epu16(_mm_adds_epu16(absdiff3, absdiff3), d3);
1131 const __m128i c4u = _mm_adds_epu16(_mm_adds_epu16(absdiff4, absdiff4), d4);
1132
1133 const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
1134 const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
1135 const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
1136 const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
1137
1138 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
1139
1140 __m128i result = select_16_equ(mindiff, c1, limit_epi16(cr, cmi1, cma1), cr);
1141 result = select_16_equ(mindiff, c3, limit_epi16(cr, cmi3, cma3), result);
1142 result = select_16_equ(mindiff, c2, limit_epi16(cr, cmi2, cma2), result);
1143 return select_16_equ(mindiff, c4, limit_epi16(cr, cmi4, cma4), result);
1144 }
1145 #endif
1146 };
1147
1148 class OpRG17 {
1149 public:
1150 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1151 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1152 AvsFilterRepair16_SORT_AXIS_CPP
1153
1154 const int l = std::max(std::max(mi1, mi2), std::max(mi3, mi4));
1155 const int u = std::min(std::min(ma1, ma2), std::min(ma3, ma4));
1156
1157 const int mi = std::min(std::min(l, u), c);
1158 const int ma = std::max(std::max(l, u), c);
1159
1160 return (limit(cr, mi, ma));
1161 }
1162 #ifdef VS_TARGET_CPU_X86
1163 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1164 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1165 AvsFilterRepair16_READ_PIX
1166 AvsFilterRepair16_SORT_AXIS_SSE2
1167
1168 const __m128i lower = _mm_max_epi16(_mm_max_epi16(mi1, mi2), _mm_max_epi16(mi3, mi4));
1169 const __m128i upper = _mm_min_epi16(_mm_min_epi16(ma1, ma2), _mm_min_epi16(ma3, ma4));
1170
1171 const __m128i real_upper = _mm_max_epi16(_mm_max_epi16(upper, lower), c);
1172 const __m128i real_lower = _mm_min_epi16(_mm_min_epi16(upper, lower), c);
1173
1174 return limit_epi16(cr, real_lower, real_upper);
1175 }
1176 #endif
1177 };
1178
1179 class OpRG18 {
1180 public:
1181 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1182 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1183 const int d1 = std::max(std::abs(c - a1), std::abs(c - a8));
1184 const int d2 = std::max(std::abs(c - a2), std::abs(c - a7));
1185 const int d3 = std::max(std::abs(c - a3), std::abs(c - a6));
1186 const int d4 = std::max(std::abs(c - a4), std::abs(c - a5));
1187
1188 const int mindiff = std::min(std::min(d1, d2), std::min(d3, d4));
1189
1190 int mi;
1191 int ma;
1192 if (mindiff == d4) {
1193 mi = std::min(a4, a5);
1194 ma = std::max(a4, a5);
1195 } else if (mindiff == d2) {
1196 mi = std::min(a2, a7);
1197 ma = std::max(a2, a7);
1198 } else if (mindiff == d3) {
1199 mi = std::min(a3, a6);
1200 ma = std::max(a3, a6);
1201 } else {
1202 mi = std::min(a1, a8);
1203 ma = std::max(a1, a8);
1204 }
1205
1206 mi = std::min(mi, c);
1207 ma = std::max(ma, c);
1208
1209 return (limit(cr, mi, ma));
1210 }
1211 #ifdef VS_TARGET_CPU_X86
1212 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1213 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1214 AvsFilterRepair16_READ_PIX
1215
1216 const __m128i a1u = _mm_xor_si128(a1, mask_sign);
1217 const __m128i a2u = _mm_xor_si128(a2, mask_sign);
1218 const __m128i a3u = _mm_xor_si128(a3, mask_sign);
1219 const __m128i a4u = _mm_xor_si128(a4, mask_sign);
1220 const __m128i a5u = _mm_xor_si128(a5, mask_sign);
1221 const __m128i a6u = _mm_xor_si128(a6, mask_sign);
1222 const __m128i a7u = _mm_xor_si128(a7, mask_sign);
1223 const __m128i a8u = _mm_xor_si128(a8, mask_sign);
1224 const __m128i cu = _mm_xor_si128(c, mask_sign);
1225
1226 const __m128i absdiff1u = abs_dif_epu16(cu, a1u);
1227 const __m128i absdiff2u = abs_dif_epu16(cu, a2u);
1228 const __m128i absdiff3u = abs_dif_epu16(cu, a3u);
1229 const __m128i absdiff4u = abs_dif_epu16(cu, a4u);
1230 const __m128i absdiff5u = abs_dif_epu16(cu, a5u);
1231 const __m128i absdiff6u = abs_dif_epu16(cu, a6u);
1232 const __m128i absdiff7u = abs_dif_epu16(cu, a7u);
1233 const __m128i absdiff8u = abs_dif_epu16(cu, a8u);
1234
1235 const __m128i absdiff1 = _mm_xor_si128(absdiff1u, mask_sign);
1236 const __m128i absdiff2 = _mm_xor_si128(absdiff2u, mask_sign);
1237 const __m128i absdiff3 = _mm_xor_si128(absdiff3u, mask_sign);
1238 const __m128i absdiff4 = _mm_xor_si128(absdiff4u, mask_sign);
1239 const __m128i absdiff5 = _mm_xor_si128(absdiff5u, mask_sign);
1240 const __m128i absdiff6 = _mm_xor_si128(absdiff6u, mask_sign);
1241 const __m128i absdiff7 = _mm_xor_si128(absdiff7u, mask_sign);
1242 const __m128i absdiff8 = _mm_xor_si128(absdiff8u, mask_sign);
1243
1244 const __m128i d1 = _mm_max_epi16(absdiff1, absdiff8);
1245 const __m128i d2 = _mm_max_epi16(absdiff2, absdiff7);
1246 const __m128i d3 = _mm_max_epi16(absdiff3, absdiff6);
1247 const __m128i d4 = _mm_max_epi16(absdiff4, absdiff5);
1248
1249 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4));
1250
1251 const __m128i mi1 = _mm_min_epi16(c, _mm_min_epi16(a1, a8));
1252 const __m128i mi2 = _mm_min_epi16(c, _mm_min_epi16(a2, a7));
1253 const __m128i mi3 = _mm_min_epi16(c, _mm_min_epi16(a3, a6));
1254 const __m128i mi4 = _mm_min_epi16(c, _mm_min_epi16(a4, a5));
1255
1256 const __m128i ma1 = _mm_max_epi16(c, _mm_max_epi16(a1, a8));
1257 const __m128i ma2 = _mm_max_epi16(c, _mm_max_epi16(a2, a7));
1258 const __m128i ma3 = _mm_max_epi16(c, _mm_max_epi16(a3, a6));
1259 const __m128i ma4 = _mm_max_epi16(c, _mm_max_epi16(a4, a5));
1260
1261 const __m128i c1 = limit_epi16(cr, mi1, ma1);
1262 const __m128i c2 = limit_epi16(cr, mi2, ma2);
1263 const __m128i c3 = limit_epi16(cr, mi3, ma3);
1264 const __m128i c4 = limit_epi16(cr, mi4, ma4);
1265
1266 __m128i result = select_16_equ(mindiff, d1, c1, cr);
1267 result = select_16_equ(mindiff, d3, c3, result);
1268 result = select_16_equ(mindiff, d2, c2, result);
1269 return select_16_equ(mindiff, d4, c4, result);
1270 }
1271 #endif
1272 };
1273
1274 class OpRG19 {
1275 public:
1276 typedef ConvUnsigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1277 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1278 const int d1 = std::abs(c - a1);
1279 const int d2 = std::abs(c - a2);
1280 const int d3 = std::abs(c - a3);
1281 const int d4 = std::abs(c - a4);
1282 const int d5 = std::abs(c - a5);
1283 const int d6 = std::abs(c - a6);
1284 const int d7 = std::abs(c - a7);
1285 const int d8 = std::abs(c - a8);
1286
1287 const int mindiff = std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8)));
1288
1289 return limit(cr, limit(c - mindiff, 0, 0xFFFF), limit(c + mindiff, 0, 0xFFFF));
1290 }
1291 #ifdef VS_TARGET_CPU_X86
1292 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1293 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1294 AvsFilterRepair16_READ_PIX
1295
1296 const __m128i d1u = abs_dif_epu16(c, a1);
1297 const __m128i d2u = abs_dif_epu16(c, a2);
1298 const __m128i d3u = abs_dif_epu16(c, a3);
1299 const __m128i d4u = abs_dif_epu16(c, a4);
1300 const __m128i d5u = abs_dif_epu16(c, a5);
1301 const __m128i d6u = abs_dif_epu16(c, a6);
1302 const __m128i d7u = abs_dif_epu16(c, a7);
1303 const __m128i d8u = abs_dif_epu16(c, a8);
1304
1305 const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1306 const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1307 const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1308 const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1309 const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1310 const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1311 const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1312 const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1313
1314 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8)));
1315
1316 const __m128i mindiffu = _mm_xor_si128(mindiff, mask_sign);
1317
1318 const __m128i mi = _mm_xor_si128(_mm_subs_epu16(c, mindiffu), mask_sign);
1319 const __m128i ma = _mm_xor_si128(_mm_adds_epu16(c, mindiffu), mask_sign);
1320
1321 return _mm_xor_si128(limit_epi16(_mm_xor_si128(cr, mask_sign), mi, ma), mask_sign);
1322 }
1323 #endif
1324 };
1325
1326 class OpRG20 {
1327 public:
1328 typedef ConvUnsigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1329 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1330 const int d1 = std::abs(c - a1);
1331 const int d2 = std::abs(c - a2);
1332 const int d3 = std::abs(c - a3);
1333 const int d4 = std::abs(c - a4);
1334 const int d5 = std::abs(c - a5);
1335 const int d6 = std::abs(c - a6);
1336 const int d7 = std::abs(c - a7);
1337 const int d8 = std::abs(c - a8);
1338
1339 int mindiff = std::min(d1, d2);
1340 int maxdiff = std::max(d1, d2);
1341
1342 maxdiff = limit(maxdiff, mindiff, d3);
1343 mindiff = std::min(mindiff, d3);
1344
1345 maxdiff = limit(maxdiff, mindiff, d4);
1346 mindiff = std::min(mindiff, d4);
1347
1348 maxdiff = limit(maxdiff, mindiff, d5);
1349 mindiff = std::min(mindiff, d5);
1350
1351 maxdiff = limit(maxdiff, mindiff, d6);
1352 mindiff = std::min(mindiff, d6);
1353
1354 maxdiff = limit(maxdiff, mindiff, d7);
1355 mindiff = std::min(mindiff, d7);
1356
1357 maxdiff = limit(maxdiff, mindiff, d8);
1358
1359 return limit(cr, limit(c - maxdiff, 0, 0xFFFF), limit(c + maxdiff, 0, 0xFFFF));
1360 }
1361 #ifdef VS_TARGET_CPU_X86
1362 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1363 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1364 AvsFilterRepair16_READ_PIX
1365
1366 const __m128i d1u = abs_dif_epu16(c, a1);
1367 const __m128i d2u = abs_dif_epu16(c, a2);
1368 const __m128i d3u = abs_dif_epu16(c, a3);
1369 const __m128i d4u = abs_dif_epu16(c, a4);
1370 const __m128i d5u = abs_dif_epu16(c, a5);
1371 const __m128i d6u = abs_dif_epu16(c, a6);
1372 const __m128i d7u = abs_dif_epu16(c, a7);
1373 const __m128i d8u = abs_dif_epu16(c, a8);
1374
1375 const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1376 const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1377 const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1378 const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1379 const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1380 const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1381 const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1382 const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1383
1384 __m128i mindiff = _mm_min_epi16(d1, d2);
1385 __m128i maxdiff = _mm_max_epi16(d1, d2);
1386
1387 maxdiff = limit_epi16(maxdiff, mindiff, d3);
1388 mindiff = _mm_min_epi16(mindiff, d3);
1389
1390 maxdiff = limit_epi16(maxdiff, mindiff, d4);
1391 mindiff = _mm_min_epi16(mindiff, d4);
1392
1393 maxdiff = limit_epi16(maxdiff, mindiff, d5);
1394 mindiff = _mm_min_epi16(mindiff, d5);
1395
1396 maxdiff = limit_epi16(maxdiff, mindiff, d6);
1397 mindiff = _mm_min_epi16(mindiff, d6);
1398
1399 maxdiff = limit_epi16(maxdiff, mindiff, d7);
1400 mindiff = _mm_min_epi16(mindiff, d7);
1401
1402 maxdiff = limit_epi16(maxdiff, mindiff, d8);
1403
1404 const __m128i maxdiffu = _mm_xor_si128(maxdiff, mask_sign);
1405
1406 const __m128i mi = _mm_xor_si128(_mm_subs_epu16(c, maxdiffu), mask_sign);
1407 const __m128i ma = _mm_xor_si128(_mm_adds_epu16(c, maxdiffu), mask_sign);
1408
1409 return _mm_xor_si128(limit_epi16(_mm_xor_si128(cr, mask_sign), mi, ma), mask_sign);
1410 }
1411 #endif
1412 };
1413
1414 class OpRG21 {
1415 public:
1416 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1417 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1418 AvsFilterRepair16_SORT_AXIS_CPP
1419
1420 const int d1 = limit(ma1 - c, 0, 0xFFFF);
1421 const int d2 = limit(ma2 - c, 0, 0xFFFF);
1422 const int d3 = limit(ma3 - c, 0, 0xFFFF);
1423 const int d4 = limit(ma4 - c, 0, 0xFFFF);
1424
1425 const int rd1 = limit(c - mi1, 0, 0xFFFF);
1426 const int rd2 = limit(c - mi2, 0, 0xFFFF);
1427 const int rd3 = limit(c - mi3, 0, 0xFFFF);
1428 const int rd4 = limit(c - mi4, 0, 0xFFFF);
1429
1430 const int u1 = std::max(d1, rd1);
1431 const int u2 = std::max(d2, rd2);
1432 const int u3 = std::max(d3, rd3);
1433 const int u4 = std::max(d4, rd4);
1434
1435 const int u = std::min(std::min(u1, u2), std::min(u3, u4));
1436
1437 return limit(cr, limit(c - u, 0, 0xFFFF), limit(c + u, 0, 0xFFFF));
1438 }
1439 #ifdef VS_TARGET_CPU_X86
1440 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1441 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1442 AvsFilterRepair16_READ_PIX
1443 AvsFilterRepair16_SORT_AXIS_SSE2
1444
1445 const __m128i d1 = _mm_subs_epi16(ma1, c);
1446 const __m128i d2 = _mm_subs_epi16(ma2, c);
1447 const __m128i d3 = _mm_subs_epi16(ma3, c);
1448 const __m128i d4 = _mm_subs_epi16(ma4, c);
1449
1450 const __m128i rd1 = _mm_subs_epi16(c, mi1);
1451 const __m128i rd2 = _mm_subs_epi16(c, mi2);
1452 const __m128i rd3 = _mm_subs_epi16(c, mi3);
1453 const __m128i rd4 = _mm_subs_epi16(c, mi4);
1454
1455 const __m128i u1 = _mm_max_epi16(d1, rd1);
1456 const __m128i u2 = _mm_max_epi16(d2, rd2);
1457 const __m128i u3 = _mm_max_epi16(d3, rd3);
1458 const __m128i u4 = _mm_max_epi16(d4, rd4);
1459
1460 const __m128i u = _mm_min_epi16(_mm_min_epi16(u1, u2), _mm_min_epi16(u3, u4));
1461
1462 const __m128i mi = _mm_subs_epi16(c, u);
1463 const __m128i ma = _mm_adds_epi16(c, u);
1464
1465 return limit_epi16(cr, mi, ma);
1466 }
1467 #endif
1468 };
1469
1470 class OpRG22 {
1471 public:
1472 typedef ConvUnsigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1473 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1474 const int d1 = std::abs(cr - a1);
1475 const int d2 = std::abs(cr - a2);
1476 const int d3 = std::abs(cr - a3);
1477 const int d4 = std::abs(cr - a4);
1478 const int d5 = std::abs(cr - a5);
1479 const int d6 = std::abs(cr - a6);
1480 const int d7 = std::abs(cr - a7);
1481 const int d8 = std::abs(cr - a8);
1482
1483 const int mindiff = std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8)));
1484
1485 return limit(c, limit(cr - mindiff, 0, 0xFFFF), limit(cr + mindiff, 0, 0xFFFF));
1486 }
1487 #ifdef VS_TARGET_CPU_X86
1488 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1489 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1490 AvsFilterRepair16_READ_PIX
1491
1492 const __m128i d1u = abs_dif_epu16(cr, a1);
1493 const __m128i d2u = abs_dif_epu16(cr, a2);
1494 const __m128i d3u = abs_dif_epu16(cr, a3);
1495 const __m128i d4u = abs_dif_epu16(cr, a4);
1496 const __m128i d5u = abs_dif_epu16(cr, a5);
1497 const __m128i d6u = abs_dif_epu16(cr, a6);
1498 const __m128i d7u = abs_dif_epu16(cr, a7);
1499 const __m128i d8u = abs_dif_epu16(cr, a8);
1500
1501 const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1502 const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1503 const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1504 const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1505 const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1506 const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1507 const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1508 const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1509
1510 const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8)));
1511
1512 const __m128i mindiffu = _mm_xor_si128(mindiff, mask_sign);
1513
1514 const __m128i mi = _mm_xor_si128(_mm_subs_epu16(cr, mindiffu), mask_sign);
1515 const __m128i ma = _mm_xor_si128(_mm_adds_epu16(cr, mindiffu), mask_sign);
1516
1517 return _mm_xor_si128(limit_epi16(_mm_xor_si128(c, mask_sign), mi, ma), mask_sign);
1518 }
1519 #endif
1520 };
1521
1522 class OpRG23 {
1523 public:
1524 typedef ConvUnsigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1525 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1526 const int d1 = std::abs(cr - a1);
1527 const int d2 = std::abs(cr - a2);
1528 const int d3 = std::abs(cr - a3);
1529 const int d4 = std::abs(cr - a4);
1530 const int d5 = std::abs(cr - a5);
1531 const int d6 = std::abs(cr - a6);
1532 const int d7 = std::abs(cr - a7);
1533 const int d8 = std::abs(cr - a8);
1534
1535 int mindiff = std::min(d1, d2);
1536 int maxdiff = std::max(d1, d2);
1537
1538 maxdiff = limit(maxdiff, mindiff, d3);
1539 mindiff = std::min(mindiff, d3);
1540
1541 maxdiff = limit(maxdiff, mindiff, d4);
1542 mindiff = std::min(mindiff, d4);
1543
1544 maxdiff = limit(maxdiff, mindiff, d5);
1545 mindiff = std::min(mindiff, d5);
1546
1547 maxdiff = limit(maxdiff, mindiff, d6);
1548 mindiff = std::min(mindiff, d6);
1549
1550 maxdiff = limit(maxdiff, mindiff, d7);
1551 mindiff = std::min(mindiff, d7);
1552
1553 maxdiff = limit(maxdiff, mindiff, d8);
1554
1555 return limit(c, limit(cr - maxdiff, 0, 0xFFFF), limit(cr + maxdiff, 0, 0xFFFF));
1556 }
1557 #ifdef VS_TARGET_CPU_X86
1558 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1559 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1560 AvsFilterRepair16_READ_PIX
1561
1562 const __m128i d1u = abs_dif_epu16(cr, a1);
1563 const __m128i d2u = abs_dif_epu16(cr, a2);
1564 const __m128i d3u = abs_dif_epu16(cr, a3);
1565 const __m128i d4u = abs_dif_epu16(cr, a4);
1566 const __m128i d5u = abs_dif_epu16(cr, a5);
1567 const __m128i d6u = abs_dif_epu16(cr, a6);
1568 const __m128i d7u = abs_dif_epu16(cr, a7);
1569 const __m128i d8u = abs_dif_epu16(cr, a8);
1570
1571 const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1572 const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1573 const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1574 const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1575 const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1576 const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1577 const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1578 const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1579
1580 __m128i mindiff = _mm_min_epi16(d1, d2);
1581 __m128i maxdiff = _mm_max_epi16(d1, d2);
1582
1583 maxdiff = limit_epi16(maxdiff, mindiff, d3);
1584 mindiff = _mm_min_epi16(mindiff, d3);
1585
1586 maxdiff = limit_epi16(maxdiff, mindiff, d4);
1587 mindiff = _mm_min_epi16(mindiff, d4);
1588
1589 maxdiff = limit_epi16(maxdiff, mindiff, d5);
1590 mindiff = _mm_min_epi16(mindiff, d5);
1591
1592 maxdiff = limit_epi16(maxdiff, mindiff, d6);
1593 mindiff = _mm_min_epi16(mindiff, d6);
1594
1595 maxdiff = limit_epi16(maxdiff, mindiff, d7);
1596 mindiff = _mm_min_epi16(mindiff, d7);
1597
1598 maxdiff = limit_epi16(maxdiff, mindiff, d8);
1599
1600 const __m128i maxdiffu = _mm_xor_si128(maxdiff, mask_sign);
1601
1602 const __m128i mi = _mm_xor_si128(_mm_subs_epu16(cr, maxdiffu), mask_sign);
1603 const __m128i ma = _mm_xor_si128(_mm_adds_epu16(cr, maxdiffu), mask_sign);
1604
1605 return _mm_xor_si128(limit_epi16(_mm_xor_si128(c, mask_sign), mi, ma), mask_sign);
1606 }
1607 #endif
1608 };
1609
1610 class OpRG24 {
1611 public:
1612 typedef ConvSigned ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1613 static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1614 AvsFilterRepair16_SORT_AXIS_CPP
1615
1616 const int d1 = limit(ma1 - cr, 0, 0xFFFF);
1617 const int d2 = limit(ma2 - cr, 0, 0xFFFF);
1618 const int d3 = limit(ma3 - cr, 0, 0xFFFF);
1619 const int d4 = limit(ma4 - cr, 0, 0xFFFF);
1620
1621 const int rd1 = limit(cr - mi1, 0, 0xFFFF);
1622 const int rd2 = limit(cr - mi2, 0, 0xFFFF);
1623 const int rd3 = limit(cr - mi3, 0, 0xFFFF);
1624 const int rd4 = limit(cr - mi4, 0, 0xFFFF);
1625
1626 const int u1 = std::max(d1, rd1);
1627 const int u2 = std::max(d2, rd2);
1628 const int u3 = std::max(d3, rd3);
1629 const int u4 = std::max(d4, rd4);
1630
1631 const int u = std::min(std::min(u1, u2), std::min(u3, u4));
1632
1633 return limit(c, limit(cr - u, 0, 0xFFFF), limit(cr + u, 0, 0xFFFF));
1634 }
1635 #ifdef VS_TARGET_CPU_X86
1636 template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1637 static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1638 AvsFilterRepair16_READ_PIX
1639 AvsFilterRepair16_SORT_AXIS_SSE2
1640
1641 const __m128i d1 = _mm_subs_epi16(ma1, cr);
1642 const __m128i d2 = _mm_subs_epi16(ma2, cr);
1643 const __m128i d3 = _mm_subs_epi16(ma3, cr);
1644 const __m128i d4 = _mm_subs_epi16(ma4, cr);
1645
1646 const __m128i rd1 = _mm_subs_epi16(cr, mi1);
1647 const __m128i rd2 = _mm_subs_epi16(cr, mi2);
1648 const __m128i rd3 = _mm_subs_epi16(cr, mi3);
1649 const __m128i rd4 = _mm_subs_epi16(cr, mi4);
1650
1651 const __m128i u1 = _mm_max_epi16(d1, rd1);
1652 const __m128i u2 = _mm_max_epi16(d2, rd2);
1653 const __m128i u3 = _mm_max_epi16(d3, rd3);
1654 const __m128i u4 = _mm_max_epi16(d4, rd4);
1655
1656 const __m128i u = _mm_min_epi16(_mm_min_epi16(u1, u2), _mm_min_epi16(u3, u4));
1657
1658 const __m128i mi = _mm_subs_epi16(cr, u);
1659 const __m128i ma = _mm_adds_epi16(cr, u);
1660
1661 return limit_epi16(c, mi, ma);
1662 }
1663 #endif
1664 };
1665
1666
1667 template <class OP, class T>
1668 class PlaneProc {
1669 public:
1670
process_subplane_cpp(const T * src1_ptr,const T * src2_ptr,T * dst_ptr,int stride,int width,int height)1671 static void process_subplane_cpp (const T *src1_ptr, const T *src2_ptr, T *dst_ptr, int stride, int width, int height)
1672 {
1673 const int y_b = 1;
1674 const int y_e = height - 1;
1675
1676 dst_ptr += y_b * stride;
1677 src1_ptr += y_b * stride;
1678 src2_ptr += y_b * stride;
1679
1680 const int x_e = width - 1;
1681
1682 for (int y = y_b; y < y_e; ++y)
1683 {
1684 dst_ptr [0] = src1_ptr [0];
1685
1686 process_row_cpp (
1687 dst_ptr,
1688 src1_ptr,
1689 src2_ptr,
1690 stride,
1691 1,
1692 x_e
1693 );
1694
1695 dst_ptr [x_e] = src1_ptr [x_e];
1696
1697 dst_ptr += stride;
1698 src1_ptr += stride;
1699 src2_ptr += stride;
1700 }
1701 }
1702
process_row_cpp(T * dst_ptr,const T * src1_ptr,const T * src2_ptr,int stride_src,int x_beg,int x_end)1703 static void process_row_cpp (T *dst_ptr, const T *src1_ptr, const T *src2_ptr, int stride_src, int x_beg, int x_end)
1704 {
1705 const int om = stride_src - 1;
1706 const int o0 = stride_src ;
1707 const int op = stride_src + 1;
1708
1709 src1_ptr += x_beg;
1710 src2_ptr += x_beg;
1711
1712 for (int x = x_beg; x < x_end; ++x)
1713 {
1714 const int cr = src1_ptr [0];
1715 const int a1 = src2_ptr [-op];
1716 const int a2 = src2_ptr [-o0];
1717 const int a3 = src2_ptr [-om];
1718 const int a4 = src2_ptr [-1 ];
1719 const int c = src2_ptr [ 0 ];
1720 const int a5 = src2_ptr [ 1 ];
1721 const int a6 = src2_ptr [ om];
1722 const int a7 = src2_ptr [ o0];
1723 const int a8 = src2_ptr [ op];
1724
1725 const int res = OP::rg (cr, a1, a2, a3, a4, c, a5, a6, a7, a8);
1726
1727 dst_ptr [x] = res;
1728
1729 ++ src1_ptr;
1730 ++ src2_ptr;
1731 }
1732 }
1733
1734 #ifdef VS_TARGET_CPU_X86
process_subplane_sse2(const T * src1_ptr,const T * src2_ptr,T * dst_ptr,int stride,int width,int height)1735 static void process_subplane_sse2 (const T *src1_ptr, const T *src2_ptr, T *dst_ptr, int stride, int width, int height)
1736 {
1737 const int y_b = 1;
1738 const int y_e = height - 1;
1739
1740 dst_ptr += y_b * stride;
1741 src1_ptr += y_b * stride;
1742 src2_ptr += y_b * stride;
1743
1744 const __m128i mask_sign = _mm_set1_epi16 (-0x8000);
1745
1746 const int x_e = width - 1;
1747 const int w8 = ((width - 2) & -8) + 1;
1748
1749 for (int y = y_b; y < y_e; ++y)
1750 {
1751 dst_ptr [0] = src1_ptr [0];
1752
1753 for (int x = 1; x < w8; x += 8)
1754 {
1755 __m128i res = OP::rg (
1756 src1_ptr + x,
1757 src2_ptr + x,
1758 stride,
1759 mask_sign
1760 );
1761
1762 res = OP::ConvSign::cv (res, mask_sign);
1763 if (sizeof(T) == 1)
1764 _mm_storel_epi64 (reinterpret_cast<__m128i *>(dst_ptr + x), _mm_packus_epi16 (res, res));
1765 else
1766 _mm_storeu_si128 (reinterpret_cast<__m128i *>(dst_ptr + x), res);
1767 }
1768
1769 process_row_cpp (
1770 dst_ptr,
1771 src1_ptr,
1772 src2_ptr,
1773 stride,
1774 w8,
1775 x_e
1776 );
1777
1778 dst_ptr [x_e] = src1_ptr [x_e];
1779
1780 dst_ptr += stride;
1781 src1_ptr += stride;
1782 src2_ptr += stride;
1783 }
1784 }
1785
1786 template <class OP1, class T1>
do_process_plane_sse2(const VSFrameRef * src1_frame,const VSFrameRef * src2_frame,VSFrameRef * dst_frame,int plane_id,const VSAPI * vsapi)1787 static void do_process_plane_sse2 (const VSFrameRef *src1_frame, const VSFrameRef *src2_frame, VSFrameRef *dst_frame, int plane_id, const VSAPI *vsapi)
1788 {
1789 const int w = vsapi->getFrameWidth(src1_frame, plane_id);
1790 const int h = vsapi->getFrameHeight(src1_frame, plane_id);
1791 T1 * dst_ptr = reinterpret_cast<T1*>(vsapi->getWritePtr(dst_frame, plane_id));
1792 const int stride = vsapi->getStride(src1_frame, plane_id);
1793
1794 const T1* src1_ptr = reinterpret_cast<const T1*>(vsapi->getReadPtr(src1_frame, plane_id));
1795 const T1* src2_ptr = reinterpret_cast<const T1*>(vsapi->getReadPtr(src2_frame, plane_id));
1796
1797 // First line
1798 memcpy (dst_ptr, src1_ptr, stride);
1799
1800 // Main content
1801 PlaneProc<OP1, T1>::process_subplane_sse2(src1_ptr, src2_ptr, dst_ptr, stride/sizeof(T1), w, h);
1802
1803 // Last line
1804 const int lp = (h - 1) * stride/sizeof(T1);
1805 memcpy (dst_ptr + lp, src1_ptr + lp, stride);
1806 }
1807
1808 #endif
1809
1810 template <class OP1, class T1>
do_process_plane_cpp(const VSFrameRef * src1_frame,const VSFrameRef * src2_frame,VSFrameRef * dst_frame,int plane_id,const VSAPI * vsapi)1811 static void do_process_plane_cpp (const VSFrameRef *src1_frame, const VSFrameRef *src2_frame, VSFrameRef *dst_frame, int plane_id, const VSAPI *vsapi)
1812 {
1813 const int w = vsapi->getFrameWidth(src1_frame, plane_id);
1814 const int h = vsapi->getFrameHeight(src1_frame, plane_id);
1815 T1 * dst_ptr = reinterpret_cast<T1*>(vsapi->getWritePtr(dst_frame, plane_id));
1816 const int stride = vsapi->getStride(src1_frame, plane_id);
1817
1818 const T1* src1_ptr = reinterpret_cast<const T1*>(vsapi->getReadPtr(src1_frame, plane_id));
1819 const T1* src2_ptr = reinterpret_cast<const T1*>(vsapi->getReadPtr(src2_frame, plane_id));
1820
1821 // First line
1822 memcpy (dst_ptr, src1_ptr, stride);
1823
1824 // Main content
1825 PlaneProc<OP1, T1>::process_subplane_cpp(src1_ptr, src2_ptr, dst_ptr, stride/sizeof(T1), w, h);
1826
1827 // Last line
1828 const int lp = (h - 1) * stride/sizeof(T1);
1829 memcpy (dst_ptr + lp, src1_ptr + lp, stride);
1830 }
1831
1832 };
1833
1834 typedef struct {
1835 VSNodeRef *node1;
1836 VSNodeRef *node2;
1837 const VSVideoInfo *vi;
1838 int mode[3];
1839 } RepairData;
1840
repairInit(VSMap * in,VSMap * out,void ** instanceData,VSNode * node,VSCore * core,const VSAPI * vsapi)1841 static void VS_CC repairInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
1842 RepairData *d = static_cast<RepairData *>(*instanceData);
1843 vsapi->setVideoInfo(d->vi, 1, node);
1844 }
1845
repairGetFrame(int n,int activationReason,void ** instanceData,void ** frameData,VSFrameContext * frameCtx,VSCore * core,const VSAPI * vsapi)1846 static const VSFrameRef *VS_CC repairGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
1847 RepairData *d = static_cast<RepairData *>(*instanceData);
1848
1849 if (activationReason == arInitial) {
1850 vsapi->requestFrameFilter(n, d->node1, frameCtx);
1851 vsapi->requestFrameFilter(n, d->node2, frameCtx);
1852 } else if (activationReason == arAllFramesReady) {
1853 const VSFrameRef *src1_frame = vsapi->getFrameFilter(n, d->node1, frameCtx);
1854 const VSFrameRef *src2_frame = vsapi->getFrameFilter(n, d->node2, frameCtx);
1855 int planes[3] = {0, 1, 2};
1856 const VSFrameRef * cp_planes[3] = { d->mode[0] ? nullptr : src1_frame, d->mode[1] ? nullptr : src1_frame, d->mode[2] ? nullptr : src1_frame };
1857 VSFrameRef *dst_frame = vsapi->newVideoFrame2(vsapi->getFrameFormat(src1_frame), vsapi->getFrameWidth(src1_frame, 0), vsapi->getFrameHeight(src1_frame, 0), cp_planes, planes, src1_frame, core);
1858
1859
1860 #define PROC_ARGS_16(op) PlaneProc <op, uint16_t>::do_process_plane_cpp<op, uint16_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1861 #define PROC_ARGS_8(op) PlaneProc <op, uint16_t>::do_process_plane_cpp<op, uint8_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1862
1863 #ifdef VS_TARGET_CPU_X86
1864 #define PROC_ARGS_16_FAST(op) PlaneProc <op, uint16_t>::do_process_plane_sse2<op, uint16_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1865 #define PROC_ARGS_8_FAST(op) PlaneProc <op, uint8_t>::do_process_plane_sse2<op, uint8_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1866 #else
1867 #define PROC_ARGS_16_FAST(op) PROC_ARGS_16(op)
1868 #define PROC_ARGS_8_FAST(op) PROC_ARGS_8(op)
1869 #endif
1870
1871
1872 if (d->vi->format->bytesPerSample == 1) {
1873 for (int i = 0; i < d->vi->format->numPlanes; i++) {
1874 switch (d->mode[i])
1875 {
1876 case 1: PROC_ARGS_8_FAST(OpRG01)
1877 case 2: PROC_ARGS_8_FAST(OpRG02)
1878 case 3: PROC_ARGS_8_FAST(OpRG03)
1879 case 4: PROC_ARGS_8_FAST(OpRG04)
1880 case 5: PROC_ARGS_8_FAST(OpRG05)
1881 case 6: PROC_ARGS_8_FAST(OpRG06)
1882 case 7: PROC_ARGS_8_FAST(OpRG07)
1883 case 8: PROC_ARGS_8_FAST(OpRG08)
1884 case 9: PROC_ARGS_8_FAST(OpRG09)
1885 case 10: PROC_ARGS_8_FAST(OpRG10)
1886 case 11: PROC_ARGS_8_FAST(OpRG01)
1887 case 12: PROC_ARGS_8_FAST(OpRG12)
1888 case 13: PROC_ARGS_8_FAST(OpRG13)
1889 case 14: PROC_ARGS_8_FAST(OpRG14)
1890 case 15: PROC_ARGS_8_FAST(OpRG15)
1891 case 16: PROC_ARGS_8_FAST(OpRG16)
1892 case 17: PROC_ARGS_8_FAST(OpRG17)
1893 case 18: PROC_ARGS_8_FAST(OpRG18)
1894 case 19: PROC_ARGS_8_FAST(OpRG19)
1895 case 20: PROC_ARGS_8_FAST(OpRG20)
1896 case 21: PROC_ARGS_8_FAST(OpRG21)
1897 case 22: PROC_ARGS_8_FAST(OpRG22)
1898 case 23: PROC_ARGS_8_FAST(OpRG23)
1899 case 24: PROC_ARGS_8_FAST(OpRG24)
1900 default: break;
1901 }
1902 }
1903 } else {
1904 for (int i = 0; i < d->vi->format->numPlanes; i++) {
1905 switch (d->mode[i])
1906 {
1907 case 1: PROC_ARGS_16_FAST(OpRG01)
1908 case 2: PROC_ARGS_16_FAST(OpRG02)
1909 case 3: PROC_ARGS_16_FAST(OpRG03)
1910 case 4: PROC_ARGS_16_FAST(OpRG04)
1911 case 5: PROC_ARGS_16_FAST(OpRG05)
1912 case 6: PROC_ARGS_16_FAST(OpRG06)
1913 case 7: PROC_ARGS_16_FAST(OpRG07)
1914 case 8: PROC_ARGS_16_FAST(OpRG08)
1915 case 9: PROC_ARGS_16_FAST(OpRG09)
1916 case 10: PROC_ARGS_16_FAST(OpRG10)
1917 case 11: PROC_ARGS_16_FAST(OpRG01)
1918 case 12: PROC_ARGS_16_FAST(OpRG12)
1919 case 13: PROC_ARGS_16_FAST(OpRG13)
1920 case 14: PROC_ARGS_16_FAST(OpRG14)
1921 case 15: PROC_ARGS_16_FAST(OpRG15)
1922 case 16: PROC_ARGS_16_FAST(OpRG16)
1923 case 17: PROC_ARGS_16_FAST(OpRG17)
1924 case 18: PROC_ARGS_16_FAST(OpRG18)
1925 case 19: PROC_ARGS_16_FAST(OpRG19)
1926 case 20: PROC_ARGS_16_FAST(OpRG20)
1927 case 21: PROC_ARGS_16_FAST(OpRG21)
1928 case 22: PROC_ARGS_16_FAST(OpRG22)
1929 case 23: PROC_ARGS_16_FAST(OpRG23)
1930 case 24: PROC_ARGS_16_FAST(OpRG24)
1931 default: break;
1932 }
1933 }
1934 }
1935
1936 vsapi->freeFrame(src1_frame);
1937 vsapi->freeFrame(src2_frame);
1938 return dst_frame;
1939 }
1940
1941 return nullptr;
1942 }
1943
repairFree(void * instanceData,VSCore * core,const VSAPI * vsapi)1944 static void VS_CC repairFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
1945 RepairData *d = static_cast<RepairData *>(instanceData);
1946 vsapi->freeNode(d->node1);
1947 vsapi->freeNode(d->node2);
1948 delete d;
1949 }
1950
repairCreate(const VSMap * in,VSMap * out,void * userData,VSCore * core,const VSAPI * vsapi)1951 void VS_CC repairCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
1952 RepairData d;
1953
1954 d.node1 = vsapi->propGetNode(in, "clip", 0, nullptr);
1955 d.vi = vsapi->getVideoInfo(d.node1);
1956
1957 if (!isConstantFormat(d.vi)) {
1958 vsapi->freeNode(d.node1);
1959 vsapi->setError(out, "Repair: Only constant format input supported");
1960 return;
1961 }
1962
1963 d.node2 = vsapi->propGetNode(in, "repairclip", 0, nullptr);
1964
1965 if (!isSameFormat(d.vi, vsapi->getVideoInfo(d.node2))) {
1966 vsapi->freeNode(d.node1);
1967 vsapi->freeNode(d.node2);
1968 vsapi->setError(out, "Repair: Input clips must have the same format");
1969 return;
1970 }
1971
1972 if (d.vi->format->sampleType != stInteger || (d.vi->format->bytesPerSample != 1 && d.vi->format->bytesPerSample != 2)) {
1973 vsapi->freeNode(d.node1);
1974 vsapi->freeNode(d.node2);
1975 vsapi->setError(out, "Repair: Only 8-16 bit int formats supported");
1976 return;
1977 }
1978
1979 int n = d.vi->format->numPlanes;
1980 int m = vsapi->propNumElements(in, "mode");
1981 if (n < m) {
1982 vsapi->freeNode(d.node1);
1983 vsapi->freeNode(d.node2);
1984 vsapi->setError(out, "Repair: Number of modes specified must be equal or fewer than the number of input planes");
1985 return;
1986 }
1987
1988 for (int i = 0; i < 3; i++) {
1989 if (i < m) {
1990 d.mode[i] = int64ToIntS(vsapi->propGetInt(in, "mode", i, nullptr));
1991 if (d.mode[i] < 0 || d.mode[i] > 24)
1992 {
1993 vsapi->freeNode(d.node1);
1994 vsapi->freeNode(d.node2);
1995 vsapi->setError(out, "Repair: Invalid mode specified, only 0-24 supported");
1996 return;
1997 }
1998 } else {
1999 d.mode[i] = d.mode[i - 1];
2000 }
2001 }
2002
2003 RepairData *data = new RepairData(d);
2004
2005 vsapi->createFilter(in, out, "Repair", repairInit, repairGetFrame, repairFree, fmParallel, 0, data, core);
2006 }
2007