1 /*****************************************************************************
2 
3         AvsFilterRemoveGrain/Repair16
4         Author: Laurent de Soras, 2012
5         Modified for VapourSynth by Fredrik Mellbin 2013
6 
7 --- Legal stuff ---
8 
9 This program is free software. It comes without any warranty, to
10 the extent permitted by applicable law. You can redistribute it
11 and/or modify it under the terms of the Do What The Fuck You Want
12 To Public License, Version 2, as published by Sam Hocevar. See
13 http://sam.zoy.org/wtfpl/COPYING for more details.
14 
15 *Tab=3***********************************************************************/
16 
17 #include "shared.h"
18 
19 #ifdef VS_TARGET_CPU_X86
20 
21 class ConvSigned
22 {
23 public:
cv(__m128i a,__m128i m)24     static __forceinline __m128i cv (__m128i a, __m128i m)
25     {
26         return (_mm_xor_si128 (a, m));
27     }
28 };
29 
30 
31 class ConvUnsigned
32 {
33 public:
cv(__m128i a,__m128i m)34     static __forceinline __m128i cv (__m128i a, __m128i m)
35     {
36         return (a);
37     }
38 };
39 
// Declares and loads the ten pixel vectors used by the SSE2 rg() kernels.
// The expanding scope must provide: the type T, the pointers src1_ptr and
// src2_ptr, the integer stride_src2, the __m128i mask_sign and a ConvSign
// type exposing cv(). Offsets are in units of T:
//   cr        <- src1_ptr
//   a1 a2 a3  <- src2_ptr - stride_src2 - 1 / + 0 / + 1
//   a4 c  a5  <- src2_ptr               - 1 / + 0 / + 1
//   a6 a7 a8  <- src2_ptr + stride_src2 - 1 / + 0 / + 1
// When sizeof(T) == 1, eight bytes are loaded and zero-extended to 16-bit
// lanes; otherwise sixteen bytes are loaded unaligned. Every vector is then
// passed through ConvSign::cv (..., mask_sign).
// The macro also leaves the helper offsets om, o0 and op in scope.
#define AvsFilterRepair16_READ_PIX    \
   const int      om = stride_src2 - 1;     \
   const int      o0 = stride_src2    ;     \
   const int      op = stride_src2 + 1;     \
   __m128i        cr, a1, a2, a3, a4, c, a5, a6, a7, a8; \
   if (sizeof(T) == 1) { \
       __m128i zeroreg = _mm_setzero_si128(); \
       cr = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src1_ptr + 0)), zeroreg), mask_sign); \
       a1 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - op)), zeroreg), mask_sign); \
       a2 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - o0)), zeroreg), mask_sign); \
       a3 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - om)), zeroreg), mask_sign); \
       a4 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr - 1 )), zeroreg), mask_sign); \
       c  = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + 0 )), zeroreg), mask_sign); \
       a5 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + 1 )), zeroreg), mask_sign); \
       a6 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + om)), zeroreg), mask_sign); \
       a7 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + o0)), zeroreg), mask_sign); \
       a8 = ConvSign::cv (_mm_unpacklo_epi8(_mm_loadl_epi64 (reinterpret_cast<const __m128i *>(src2_ptr + op)), zeroreg), mask_sign); \
   } else {     \
       cr = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src1_ptr + 0 )), mask_sign); \
       a1 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - op)), mask_sign); \
       a2 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - o0)), mask_sign); \
       a3 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - om)), mask_sign); \
       a4 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr - 1 )), mask_sign); \
       c  = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + 0 )), mask_sign); \
       a5 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + 1 )), mask_sign); \
       a6 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + om)), mask_sign); \
       a7 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + o0)), mask_sign); \
       a8 = ConvSign::cv (_mm_loadu_si128 (reinterpret_cast<const __m128i *>(src2_ptr + op)), mask_sign); \
   }
69 
// Pairs the opposite neighbours (a1/a8, a2/a7, a3/a6, a4/a5) and defines,
// for each pair N, maN / miN as the per-lane signed 16-bit maximum / minimum.
// Requires the __m128i variables a1..a8 to be in scope where it expands.
#define AvsFilterRepair16_SORT_AXIS_SSE2   \
    const __m128i  ma1 = _mm_max_epi16(a1, a8); \
    const __m128i  mi1 = _mm_min_epi16(a1, a8); \
    const __m128i  ma2 = _mm_max_epi16(a2, a7); \
    const __m128i  mi2 = _mm_min_epi16(a2, a7); \
    const __m128i  ma3 = _mm_max_epi16(a3, a6); \
    const __m128i  mi3 = _mm_min_epi16(a3, a6); \
    const __m128i  ma4 = _mm_max_epi16(a4, a5); \
    const __m128i  mi4 = _mm_min_epi16(a4, a5);
79 
80 #else
81 
// Empty placeholder for builds without VS_TARGET_CPU_X86: it only keeps the
// ConvSign typedefs of the operators valid; there is no cv() here.
class ConvSigned {};
85 
86 
// Empty placeholder for builds without VS_TARGET_CPU_X86: it only keeps the
// ConvSign typedefs of the operators valid; there is no cv() here.
class ConvUnsigned {};
90 #endif
91 
// Scalar counterpart of the SSE2 axis sort: for each pair of opposite
// neighbours (a1/a8, a2/a7, a3/a6, a4/a5) defines maN / miN as the larger /
// smaller value. Requires a1..a8 to be in scope where it expands.
#define AvsFilterRepair16_SORT_AXIS_CPP \
    const int      ma1 = std::max(a1, a8);   \
    const int      mi1 = std::min(a1, a8);   \
    const int      ma2 = std::max(a2, a7);   \
    const int      mi2 = std::min(a2, a7);   \
    const int      ma3 = std::max(a3, a6);   \
    const int      mi3 = std::min(a3, a6);   \
    const int      ma4 = std::max(a4, a5);   \
    const int      mi4 = std::min(a4, a5);
101 
102 
103 class OpRG01
104 {
105 public:
106     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)107     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
108         const int        mi = std::min (std::min (
109             std::min (std::min (a1, a2), std::min (a3, a4)),
110             std::min (std::min (a5, a6), std::min (a7, a8))
111         ), c);
112         const int        ma = std::max (std::max (
113             std::max (std::max (a1, a2), std::max (a3, a4)),
114             std::max (std::max (a5, a6), std::max (a7, a8))
115         ), c);
116 
117         return (limit (cr, mi, ma));
118     }
119 #ifdef VS_TARGET_CPU_X86
120     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)121     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
122         AvsFilterRepair16_READ_PIX
123 
124         const __m128i    mi = _mm_min_epi16 (_mm_min_epi16 (
125             _mm_min_epi16 (_mm_min_epi16 (a1, a2), _mm_min_epi16 (a3, a4)),
126             _mm_min_epi16 (_mm_min_epi16 (a5, a6), _mm_min_epi16 (a7, a8))
127         ), c);
128         const __m128i    ma = _mm_max_epi16 (_mm_max_epi16 (
129             _mm_max_epi16 (_mm_max_epi16 (a1, a2), _mm_max_epi16 (a3, a4)),
130             _mm_max_epi16 (_mm_max_epi16 (a5, a6), _mm_max_epi16 (a7, a8))
131         ), c);
132 
133         return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
134     }
135 #endif
136 };
137 
138 class OpRG02
139 {
140 public:
141     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)142     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
143         int                a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };
144 
145         std::sort (&a [0], (&a [8]) + 1);
146 
147         return (limit (cr, a [2-1], a [7]));
148     }
149 #ifdef VS_TARGET_CPU_X86
150     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)151     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
152         AvsFilterRepair16_READ_PIX
153 
154         sort_pair (a1, a8);
155 
156         sort_pair (a1,  c);
157         sort_pair (a2, a5);
158         sort_pair (a3, a6);
159         sort_pair (a4, a7);
160         sort_pair ( c, a8);
161 
162         sort_pair (a1, a3);
163         sort_pair ( c, a6);
164         sort_pair (a2, a4);
165         sort_pair (a5, a7);
166 
167         sort_pair (a3, a8);
168 
169         sort_pair (a3,  c);
170         sort_pair (a6, a8);
171         sort_pair (a4, a5);
172 
173         a2 = _mm_max_epi16 (a1, a2);    // sort_pair (a1, a2);
174         a3 = _mm_min_epi16 (a3, a4);    // sort_pair (a3, a4);
175         sort_pair ( c, a5);
176         a7 = _mm_max_epi16 (a6, a7);    // sort_pair (a6, a7);
177 
178         sort_pair (a2, a8);
179 
180         a2 = _mm_min_epi16 (a2,  c);    // sort_pair (a2,  c);
181         a8 = _mm_max_epi16 (a5, a8);    // sort_pair (a5, a8);
182 
183         a2 = _mm_min_epi16 (a2, a3);    // sort_pair (a2, a3);
184         a7 = _mm_min_epi16 (a7, a8);    // sort_pair (a7, a8);
185 
186         return (_mm_min_epi16 (_mm_max_epi16 (cr, a2), a7));
187     }
188 #endif
189 };
190 
191 class OpRG03
192 {
193 public:
194     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)195     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
196         int                a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };
197 
198         std::sort (&a [0], (&a [8]) + 1);
199 
200         return (limit (cr, a [3-1], a [6]));
201     }
202 #ifdef VS_TARGET_CPU_X86
203     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)204     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
205         AvsFilterRepair16_READ_PIX
206 
207         sort_pair (a1, a8);
208 
209         sort_pair (a1,  c);
210         sort_pair (a2, a5);
211         sort_pair (a3, a6);
212         sort_pair (a4, a7);
213         sort_pair ( c, a8);
214 
215         sort_pair (a1, a3);
216         sort_pair ( c, a6);
217         sort_pair (a2, a4);
218         sort_pair (a5, a7);
219 
220         sort_pair (a3, a8);
221 
222         sort_pair (a3,  c);
223         sort_pair (a6, a8);
224         sort_pair (a4, a5);
225 
226         a2 = _mm_max_epi16 (a1, a2);    // sort_pair (a1, a2);
227         sort_pair (a3, a4);
228         sort_pair ( c, a5);
229         a6 = _mm_min_epi16 (a6, a7);    // sort_pair (a6, a7);
230 
231         sort_pair (a2, a8);
232 
233         a2 = _mm_min_epi16 (a2,  c);    // sort_pair (a2,  c);
234         a6 = _mm_max_epi16 (a4, a6);    // sort_pair (a4, a6);
235         a5 = _mm_min_epi16 (a5, a8);    // sort_pair (a5, a8);
236 
237         a3 = _mm_max_epi16 (a2, a3);    // sort_pair (a2, a3);
238         a6 = _mm_max_epi16 (a5, a6);    // sort_pair (a5, a6);
239 
240         return (_mm_min_epi16 (_mm_max_epi16 (cr, a3), a6));
241     }
242 #endif
243 };
244 
245 class OpRG04
246 {
247 public:
248     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)249     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
250         int                a [9] = { a1, a2, a3, a4, c, a5, a6, a7, a8 };
251 
252         std::sort (&a [0], (&a [8]) + 1);
253 
254         return (limit (cr, a [4-1], a [5]));
255     }
256 #ifdef VS_TARGET_CPU_X86
257     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)258     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
259         // http://jgamble.ripco.net/cgi-bin/nw.cgi?inputs=9&algorithm=batcher&output=text
260 
261         AvsFilterRepair16_READ_PIX
262 
263         sort_pair (a1, a8);
264 
265         sort_pair (a1,  c);
266         sort_pair (a2, a5);
267         sort_pair (a3, a6);
268         sort_pair (a4, a7);
269         sort_pair ( c, a8);
270 
271         sort_pair (a1, a3);
272         sort_pair ( c, a6);
273         sort_pair (a2, a4);
274         sort_pair (a5, a7);
275 
276         sort_pair (a3, a8);
277 
278         sort_pair (a3,  c);
279         sort_pair (a6, a8);
280         sort_pair (a4, a5);
281 
282         a2 = _mm_max_epi16 (a1, a2);    // sort_pair (a1, a2);
283         a4 = _mm_max_epi16 (a3, a4);    // sort_pair (a3, a4);
284         sort_pair ( c, a5);
285         a6 = _mm_min_epi16 (a6, a7);    // sort_pair (a6, a7);
286 
287         sort_pair (a2, a8);
288 
289         c  = _mm_max_epi16 (a2,  c);    // sort_pair (a2,  c);
290         sort_pair (a4, a6);
291         a5 = _mm_min_epi16 (a5, a8);    // sort_pair (a5, a8);
292 
293         a4 = _mm_min_epi16 (a4,  c);    // sort_pair (a4,  c);
294         a5 = _mm_min_epi16 (a5, a6);    // sort_pair (a5, a6);
295 
296         return (_mm_min_epi16 (_mm_max_epi16 (cr, a4), a5));
297     }
298 #endif
299 };
300 
301 class OpRG05
302 {
303 public:
304     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)305     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
306         const int mal1 = std::max(std::max(a1, a8), c);
307         const int mil1 = std::min(std::min(a1, a8), c);
308 
309         const int mal2 = std::max(std::max(a2, a7), c);
310         const int mil2 = std::min(std::min(a2, a7), c);
311 
312         const int mal3 = std::max(std::max(a3, a6), c);
313         const int mil3 = std::min(std::min(a3, a6), c);
314 
315         const int mal4 = std::max(std::max(a4, a5), c);
316         const int mil4 = std::min(std::min(a4, a5), c);
317 
318         const int clipped1 = limit(cr, mil1, mal1);
319         const int clipped2 = limit(cr, mil2, mal2);
320         const int clipped3 = limit(cr, mil3, mal3);
321         const int clipped4 = limit(cr, mil4, mal4);
322 
323         const int c1 = std::abs(cr - clipped1);
324         const int c2 = std::abs(cr - clipped2);
325         const int c3 = std::abs(cr - clipped3);
326         const int c4 = std::abs(cr - clipped4);
327 
328         const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
329 
330         if (mindiff == c4)
331             return clipped4;
332         else if (mindiff == c2)
333             return clipped2;
334         else if (mindiff == c3)
335             return clipped3;
336         else
337             return clipped1;
338     }
339 #ifdef VS_TARGET_CPU_X86
340     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)341     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
342         AvsFilterRepair16_READ_PIX
343 
344         const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
345         const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
346 
347         const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
348         const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
349 
350         const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
351         const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
352 
353         const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
354         const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
355 
356         const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
357         const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
358         const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
359         const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
360 
361         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
362         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
363         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
364         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
365         const __m128i cru = _mm_xor_si128(cr, mask_sign);
366 
367         const __m128i c1u = abs_dif_epu16(cru, clipped1u);
368         const __m128i c2u = abs_dif_epu16(cru, clipped2u);
369         const __m128i c3u = abs_dif_epu16(cru, clipped3u);
370         const __m128i c4u = abs_dif_epu16(cru, clipped4u);
371 
372         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
373         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
374         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
375         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
376 
377         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
378 
379         __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
380         result = select_16_equ(mindiff, c3, clipped3, result);
381         result = select_16_equ(mindiff, c2, clipped2, result);
382         return select_16_equ(mindiff, c4, clipped4, result);
383 }
384 #endif
385 };
386 
387 class OpRG06
388 {
389 public:
390     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)391     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
392         const int mal1 = std::max(std::max(a1, a8), c);
393         const int mil1 = std::min(std::min(a1, a8), c);
394 
395         const int mal2 = std::max(std::max(a2, a7), c);
396         const int mil2 = std::min(std::min(a2, a7), c);
397 
398         const int mal3 = std::max(std::max(a3, a6), c);
399         const int mil3 = std::min(std::min(a3, a6), c);
400 
401         const int mal4 = std::max(std::max(a4, a5), c);
402         const int mil4 = std::min(std::min(a4, a5), c);
403 
404         const int d1 = mal1 - mil1;
405         const int d2 = mal2 - mil2;
406         const int d3 = mal3 - mil3;
407         const int d4 = mal4 - mil4;
408 
409         const int clipped1 = limit(cr, mil1, mal1);
410         const int clipped2 = limit(cr, mil2, mal2);
411         const int clipped3 = limit(cr, mil3, mal3);
412         const int clipped4 = limit(cr, mil4, mal4);
413 
414         const int c1 = limit((std::abs(cr - clipped1) << 1) + d1, 0, 0xFFFF);
415         const int c2 = limit((std::abs(cr - clipped2) << 1) + d2, 0, 0xFFFF);
416         const int c3 = limit((std::abs(cr - clipped3) << 1) + d3, 0, 0xFFFF);
417         const int c4 = limit((std::abs(cr - clipped4) << 1) + d4, 0, 0xFFFF);
418 
419         const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
420 
421         if (mindiff == c4)
422             return clipped4;
423         else if (mindiff == c2)
424             return clipped2;
425         else if (mindiff == c3)
426             return clipped3;
427         else
428             return clipped1;
429     }
430 #ifdef VS_TARGET_CPU_X86
431     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)432     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
433         AvsFilterRepair16_READ_PIX
434 
435         const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
436         const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
437 
438         const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
439         const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
440 
441         const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
442         const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
443 
444         const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
445         const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
446 
447         const __m128i d1 = _mm_sub_epi16(mal1, mil1);
448         const __m128i d2 = _mm_sub_epi16(mal2, mil2);
449         const __m128i d3 = _mm_sub_epi16(mal3, mil3);
450         const __m128i d4 = _mm_sub_epi16(mal4, mil4);
451 
452         const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
453         const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
454         const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
455         const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
456 
457         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
458         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
459         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
460         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
461         const __m128i cru = _mm_xor_si128(cr, mask_sign);
462 
463         const __m128i absdiff1 = abs_dif_epu16(cru, clipped1u);
464         const __m128i absdiff2 = abs_dif_epu16(cru, clipped2u);
465         const __m128i absdiff3 = abs_dif_epu16(cru, clipped3u);
466         const __m128i absdiff4 = abs_dif_epu16(cru, clipped4u);
467 
468         const __m128i c1u = _mm_adds_epu16(_mm_adds_epu16(absdiff1, absdiff1), d1);
469         const __m128i c2u = _mm_adds_epu16(_mm_adds_epu16(absdiff2, absdiff2), d2);
470         const __m128i c3u = _mm_adds_epu16(_mm_adds_epu16(absdiff3, absdiff3), d3);
471         const __m128i c4u = _mm_adds_epu16(_mm_adds_epu16(absdiff4, absdiff4), d4);
472 
473         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
474         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
475         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
476         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
477 
478         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
479 
480         __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
481         result = select_16_equ(mindiff, c3, clipped3, result);
482         result = select_16_equ(mindiff, c2, clipped2, result);
483         return select_16_equ(mindiff, c4, clipped4, result);
484     }
485 #endif
486 };
487 
488 class OpRG07
489 {
490 public:
491     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)492     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
493         const int mal1 = std::max(std::max(a1, a8), c);
494         const int mil1 = std::min(std::min(a1, a8), c);
495 
496         const int mal2 = std::max(std::max(a2, a7), c);
497         const int mil2 = std::min(std::min(a2, a7), c);
498 
499         const int mal3 = std::max(std::max(a3, a6), c);
500         const int mil3 = std::min(std::min(a3, a6), c);
501 
502         const int mal4 = std::max(std::max(a4, a5), c);
503         const int mil4 = std::min(std::min(a4, a5), c);
504 
505         const int d1 = mal1 - mil1;
506         const int d2 = mal2 - mil2;
507         const int d3 = mal3 - mil3;
508         const int d4 = mal4 - mil4;
509 
510         const int clipped1 = limit(cr, mil1, mal1);
511         const int clipped2 = limit(cr, mil2, mal2);
512         const int clipped3 = limit(cr, mil3, mal3);
513         const int clipped4 = limit(cr, mil4, mal4);
514 
515         const int c1 = std::abs(cr - clipped1) + d1;
516         const int c2 = std::abs(cr - clipped2) + d2;
517         const int c3 = std::abs(cr - clipped3) + d3;
518         const int c4 = std::abs(cr - clipped4) + d4;
519 
520         const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
521 
522         if (mindiff == c4)
523             return clipped4;
524         else if (mindiff == c2)
525             return clipped2;
526         else if (mindiff == c3)
527             return clipped3;
528         else
529             return clipped1;
530     }
531 #ifdef VS_TARGET_CPU_X86
532     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)533     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
534         AvsFilterRepair16_READ_PIX
535 
536         const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
537         const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
538 
539         const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
540         const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
541 
542         const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
543         const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
544 
545         const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
546         const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
547 
548         const __m128i d1 = _mm_sub_epi16(mal1, mil1);
549         const __m128i d2 = _mm_sub_epi16(mal2, mil2);
550         const __m128i d3 = _mm_sub_epi16(mal3, mil3);
551         const __m128i d4 = _mm_sub_epi16(mal4, mil4);
552 
553         const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
554         const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
555         const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
556         const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
557 
558         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
559         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
560         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
561         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
562         const __m128i cru = _mm_xor_si128(cr, mask_sign);
563 
564         //todo: what happens when this overflows?
565         const __m128i c1u = _mm_adds_epu16(abs_dif_epu16(cru, clipped1u), d1);
566         const __m128i c2u = _mm_adds_epu16(abs_dif_epu16(cru, clipped2u), d2);
567         const __m128i c3u = _mm_adds_epu16(abs_dif_epu16(cru, clipped3u), d3);
568         const __m128i c4u = _mm_adds_epu16(abs_dif_epu16(cru, clipped4u), d4);
569 
570         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
571         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
572         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
573         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
574 
575         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
576 
577         __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
578         result = select_16_equ(mindiff, c3, clipped3, result);
579         result = select_16_equ(mindiff, c2, clipped2, result);
580         return select_16_equ(mindiff, c4, clipped4, result);
581     }
582 #endif
583 };
584 
585 class OpRG08
586 {
587 public:
588     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)589     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
590         const int mal1 = std::max(std::max(a1, a8), c);
591         const int mil1 = std::min(std::min(a1, a8), c);
592 
593         const int mal2 = std::max(std::max(a2, a7), c);
594         const int mil2 = std::min(std::min(a2, a7), c);
595 
596         const int mal3 = std::max(std::max(a3, a6), c);
597         const int mil3 = std::min(std::min(a3, a6), c);
598 
599         const int mal4 = std::max(std::max(a4, a5), c);
600         const int mil4 = std::min(std::min(a4, a5), c);
601 
602         const int d1 = mal1 - mil1;
603         const int d2 = mal2 - mil2;
604         const int d3 = mal3 - mil3;
605         const int d4 = mal4 - mil4;
606 
607         const int clipped1 = limit(cr, mil1, mal1);
608         const int clipped2 = limit(cr, mil2, mal2);
609         const int clipped3 = limit(cr, mil3, mal3);
610         const int clipped4 = limit(cr, mil4, mal4);
611 
612         const int c1 = limit(std::abs(cr - clipped1) + (d1 << 1), 0, 0xFFFF);
613         const int c2 = limit(std::abs(cr - clipped2) + (d2 << 1), 0, 0xFFFF);
614         const int c3 = limit(std::abs(cr - clipped3) + (d3 << 1), 0, 0xFFFF);
615         const int c4 = limit(std::abs(cr - clipped4) + (d4 << 1), 0, 0xFFFF);
616 
617         const int mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
618 
619         if (mindiff == c4)
620             return clipped4;
621         else if (mindiff == c2)
622             return clipped2;
623         else if (mindiff == c3)
624             return clipped3;
625         else
626             return clipped1;
627     }
628 #ifdef VS_TARGET_CPU_X86
629     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)630     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
631         AvsFilterRepair16_READ_PIX
632 
633         const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
634         const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
635 
636         const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
637         const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
638 
639         const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
640         const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
641 
642         const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
643         const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
644 
645         const __m128i d1 = _mm_sub_epi16(mal1, mil1);
646         const __m128i d2 = _mm_sub_epi16(mal2, mil2);
647         const __m128i d3 = _mm_sub_epi16(mal3, mil3);
648         const __m128i d4 = _mm_sub_epi16(mal4, mil4);
649 
650         const __m128i clipped1 = limit_epi16(cr, mil1, mal1);
651         const __m128i clipped2 = limit_epi16(cr, mil2, mal2);
652         const __m128i clipped3 = limit_epi16(cr, mil3, mal3);
653         const __m128i clipped4 = limit_epi16(cr, mil4, mal4);
654 
655         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
656         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
657         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
658         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
659         const __m128i cru = _mm_xor_si128(cr, mask_sign);
660 
661         const __m128i c1u = _mm_adds_epu16(abs_dif_epu16(cru, clipped1u), _mm_adds_epu16(d1, d1));
662         const __m128i c2u = _mm_adds_epu16(abs_dif_epu16(cru, clipped2u), _mm_adds_epu16(d2, d2));
663         const __m128i c3u = _mm_adds_epu16(abs_dif_epu16(cru, clipped3u), _mm_adds_epu16(d3, d3));
664         const __m128i c4u = _mm_adds_epu16(abs_dif_epu16(cru, clipped4u), _mm_adds_epu16(d4, d4));
665 
666         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
667         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
668         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
669         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
670 
671         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
672 
673         __m128i result = select_16_equ(mindiff, c1, clipped1, cr);
674         result = select_16_equ(mindiff, c3, clipped3, result);
675         result = select_16_equ(mindiff, c2, clipped2, result);
676         return select_16_equ(mindiff, c4, clipped4, result);
677     }
678 #endif
679 };
680 
681 class OpRG09
682 {
683 public:
684     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)685     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
686         const int mal1 = std::max(std::max(a1, a8), c);
687         const int mil1 = std::min(std::min(a1, a8), c);
688 
689         const int mal2 = std::max(std::max(a2, a7), c);
690         const int mil2 = std::min(std::min(a2, a7), c);
691 
692         const int mal3 = std::max(std::max(a3, a6), c);
693         const int mil3 = std::min(std::min(a3, a6), c);
694 
695         const int mal4 = std::max(std::max(a4, a5), c);
696         const int mil4 = std::min(std::min(a4, a5), c);
697 
698         const int d1 = mal1 - mil1;
699         const int d2 = mal2 - mil2;
700         const int d3 = mal3 - mil3;
701         const int d4 = mal4 - mil4;
702 
703         const int mindiff = std::min(std::min(d1, d2), std::min(d3, d4));
704 
705         if (mindiff == d4)
706             return limit(cr, mil4, mal4);
707         else if (mindiff == d2)
708             return limit(cr, mil2, mal2);
709         else if (mindiff == d3)
710             return limit(cr, mil3, mal3);
711         else
712             return limit(cr, mil1, mal1);
713     }
714 #ifdef VS_TARGET_CPU_X86
715     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)716     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
717         AvsFilterRepair16_READ_PIX
718 
719         const __m128i mal1 = _mm_max_epi16(_mm_max_epi16(a1, a8), c);
720         const __m128i mil1 = _mm_min_epi16(_mm_min_epi16(a1, a8), c);
721 
722         const __m128i mal2 = _mm_max_epi16(_mm_max_epi16(a2, a7), c);
723         const __m128i mil2 = _mm_min_epi16(_mm_min_epi16(a2, a7), c);
724 
725         const __m128i mal3 = _mm_max_epi16(_mm_max_epi16(a3, a6), c);
726         const __m128i mil3 = _mm_min_epi16(_mm_min_epi16(a3, a6), c);
727 
728         const __m128i mal4 = _mm_max_epi16(_mm_max_epi16(a4, a5), c);
729         const __m128i mil4 = _mm_min_epi16(_mm_min_epi16(a4, a5), c);
730 
731         const __m128i d1 = _mm_sub_epi16(mal1, mil1);
732         const __m128i d2 = _mm_sub_epi16(mal2, mil2);
733         const __m128i d3 = _mm_sub_epi16(mal3, mil3);
734         const __m128i d4 = _mm_sub_epi16(mal4, mil4);
735 
736         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4));
737 
738         __m128i result = select_16_equ(mindiff, d1, limit_epi16(cr, mil1, mal1), cr);
739         result = select_16_equ(mindiff, d3, limit_epi16(cr, mil3, mal3), result);
740         result = select_16_equ(mindiff, d2, limit_epi16(cr, mil2, mal2), result);
741         return select_16_equ(mindiff, d4, limit_epi16(cr, mil4, mal4), result);
742     }
743 #endif
744 };
745 
746 class OpRG10
747 {
748 public:
749     typedef    ConvUnsigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)750     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
751         const int d1 = std::abs(cr - a1);
752         const int d2 = std::abs(cr - a2);
753         const int d3 = std::abs(cr - a3);
754         const int d4 = std::abs(cr - a4);
755         const int d5 = std::abs(cr - a5);
756         const int d6 = std::abs(cr - a6);
757         const int d7 = std::abs(cr - a7);
758         const int d8 = std::abs(cr - a8);
759         const int dc = std::abs(cr - c);
760 
761         const int mindiff = std::min(std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8))), dc);
762 
763         if (mindiff == d7)
764             return a7;
765         else if (mindiff == d8)
766             return a8;
767         else if (mindiff == d6)
768             return a6;
769         else if (mindiff == d2)
770             return a2;
771         else if (mindiff == d3)
772             return a3;
773         else if (mindiff == d1)
774             return a1;
775         else if (mindiff == d5)
776             return a5;
777         else if (mindiff == dc)
778             return c;
779         else
780             return a4;
781     }
782 #ifdef VS_TARGET_CPU_X86
783     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)784     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
785         AvsFilterRepair16_READ_PIX
786 
787         const __m128i d1u = abs_dif_epu16(cr, a1);
788         const __m128i d2u = abs_dif_epu16(cr, a2);
789         const __m128i d3u = abs_dif_epu16(cr, a3);
790         const __m128i d4u = abs_dif_epu16(cr, a4);
791         const __m128i d5u = abs_dif_epu16(cr, a5);
792         const __m128i d6u = abs_dif_epu16(cr, a6);
793         const __m128i d7u = abs_dif_epu16(cr, a7);
794         const __m128i d8u = abs_dif_epu16(cr, a8);
795         const __m128i dcu = abs_dif_epu16(cr, c);
796 
797         const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
798         const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
799         const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
800         const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
801         const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
802         const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
803         const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
804         const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
805         const __m128i dc = _mm_xor_si128(dcu, mask_sign);
806 
807         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8))), dc);
808 
809         __m128i result = select_16_equ(mindiff, d4, a4, c);
810         result = select_16_equ(mindiff, dc, c, result);
811         result = select_16_equ(mindiff, d5, a5, result);
812         result = select_16_equ(mindiff, d1, a1, result);
813         result = select_16_equ(mindiff, d3, a3, result);
814         result = select_16_equ(mindiff, d2, a2, result);
815         result = select_16_equ(mindiff, d6, a6, result);
816         result = select_16_equ(mindiff, d8, a8, result);
817         return select_16_equ(mindiff, d7, a7, result);
818     }
819 #endif
820 };
821 
822 class OpRG12
823 {
824 public:
825     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)826     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
827         int                a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
828 
829         std::sort (&a [0], (&a [0]) + 8);
830         const int        mi = std::min (a [2-1], c);
831         const int        ma = std::max (a [7-1], c);
832 
833         return (limit (cr, mi, ma));
834     }
835 #ifdef VS_TARGET_CPU_X86
836     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)837     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
838         AvsFilterRepair16_READ_PIX
839 
840         sort_pair (a1, a2);
841         sort_pair (a3, a4);
842         sort_pair (a5, a6);
843         sort_pair (a7, a8);
844 
845         sort_pair (a1, a3);
846         sort_pair (a2, a4);
847         sort_pair (a5, a7);
848         sort_pair (a6, a8);
849 
850         sort_pair (a2, a3);
851         sort_pair (a6, a7);
852 
853         a5 = _mm_max_epi16 (a1, a5);    // sort_pair (a1, a5);
854         sort_pair (a2, a6);
855         sort_pair (a3, a7);
856         a4 = _mm_min_epi16 (a4, a8);    // sort_pair (a4, a8);
857 
858         a3 = _mm_min_epi16 (a3, a5);    // sort_pair (a3, a5);
859         a6 = _mm_max_epi16 (a4, a6);    // sort_pair (a4, a6);
860 
861         a2 = _mm_min_epi16 (a2, a3);    // sort_pair (a2, a3);
862         a7 = _mm_max_epi16 (a6, a7);    // sort_pair (a6, a7);
863 
864         const __m128i    mi = _mm_min_epi16 (c, a2);
865         const __m128i    ma = _mm_max_epi16 (c, a7);
866 
867         return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
868     }
869 #endif
870 };
871 
872 class OpRG13
873 {
874 public:
875     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)876     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
877         int                a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
878 
879         std::sort (&a [0], (&a [0]) + 8);
880         const int        mi = std::min (a [3-1], c);
881         const int        ma = std::max (a [6-1], c);
882 
883         return (limit (cr, mi, ma));
884     }
885 #ifdef VS_TARGET_CPU_X86
886     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)887     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
888         AvsFilterRepair16_READ_PIX
889 
890         sort_pair (a1, a2);
891         sort_pair (a3, a4);
892         sort_pair (a5, a6);
893         sort_pair (a7, a8);
894 
895         sort_pair (a1, a3);
896         sort_pair (a2, a4);
897         sort_pair (a5, a7);
898         sort_pair (a6, a8);
899 
900         sort_pair (a2, a3);
901         sort_pair (a6, a7);
902 
903         a5 = _mm_max_epi16 (a1, a5);    // sort_pair (a1, a5);
904         sort_pair (a2, a6);
905         sort_pair (a3, a7);
906         a4 = _mm_min_epi16 (a4, a8);    // sort_pair (a4, a8);
907 
908         a3 = _mm_min_epi16 (a3, a5);    // sort_pair (a3, a5);
909         a6 = _mm_max_epi16 (a4, a6);    // sort_pair (a4, a6);
910 
911         a3 = _mm_max_epi16 (a2, a3);    // sort_pair (a2, a3);
912         a6 = _mm_min_epi16 (a6, a7);    // sort_pair (a6, a7);
913 
914         const __m128i    mi = _mm_min_epi16 (c, a3);
915         const __m128i    ma = _mm_max_epi16 (c, a6);
916 
917         return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
918     }
919 #endif
920 };
921 
922 class OpRG14
923 {
924 public:
925     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)926     static __forceinline int rg (int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
927         int                a [8] = { a1, a2, a3, a4, a5, a6, a7, a8 };
928 
929         std::sort (&a [0], (&a [0]) + 8);
930         const int        mi = std::min (a [4-1], c);
931         const int        ma = std::max (a [5-1], c);
932 
933         return (limit (cr, mi, ma));
934     }
935 #ifdef VS_TARGET_CPU_X86
936     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)937     static __forceinline __m128i rg (const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
938         AvsFilterRepair16_READ_PIX
939 
940         sort_pair (a1, a2);
941         sort_pair (a3, a4);
942         sort_pair (a5, a6);
943         sort_pair (a7, a8);
944 
945         sort_pair (a1, a3);
946         sort_pair (a2, a4);
947         sort_pair (a5, a7);
948         sort_pair (a6, a8);
949 
950         sort_pair (a2, a3);
951         sort_pair (a6, a7);
952 
953         a5 = _mm_max_epi16 (a1, a5);    // sort_pair (a1, a5);
954         a6 = _mm_max_epi16 (a2, a6);    // sort_pair (a2, a6);
955         a3 = _mm_min_epi16 (a3, a7);    // sort_pair (a3, a7);
956         a4 = _mm_min_epi16 (a4, a8);    // sort_pair (a4, a8);
957 
958         a5 = _mm_max_epi16 (a3, a5);    // sort_pair (a3, a5);
959         a4 = _mm_min_epi16 (a4, a6);    // sort_pair (a4, a6);
960 
961         sort_pair (a4, a5);
962 
963         const __m128i    mi = _mm_min_epi16 (c, a4);
964         const __m128i    ma = _mm_max_epi16 (c, a5);
965 
966         return (_mm_min_epi16 (_mm_max_epi16 (cr, mi), ma));
967     }
968 #endif
969 };
970 
971 class OpRG15 {
972 public:
973     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)974     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
975         AvsFilterRepair16_SORT_AXIS_CPP
976 
977         const int      c1 = std::abs(c - limit(c, mi1, ma1));
978         const int      c2 = std::abs(c - limit(c, mi2, ma2));
979         const int      c3 = std::abs(c - limit(c, mi3, ma3));
980         const int      c4 = std::abs(c - limit(c, mi4, ma4));
981 
982         const int      mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
983 
984         int            mi;
985         int            ma;
986         if (mindiff == c4) {
987             mi = mi4;
988             ma = ma4;
989         } else if (mindiff == c2) {
990             mi = mi2;
991             ma = ma2;
992         } else if (mindiff == c3) {
993             mi = mi3;
994             ma = ma3;
995         } else {
996             mi = mi1;
997             ma = ma1;
998         }
999 
1000         mi = std::min(mi, c);
1001         ma = std::max(ma, c);
1002 
1003         return (limit(cr, mi, ma));
1004     }
1005 #ifdef VS_TARGET_CPU_X86
1006     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1007     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1008         AvsFilterRepair16_READ_PIX
1009         AvsFilterRepair16_SORT_AXIS_SSE2
1010 
1011         const __m128i cma1 = _mm_max_epi16(c, ma1);
1012         const __m128i cma2 = _mm_max_epi16(c, ma2);
1013         const __m128i cma3 = _mm_max_epi16(c, ma3);
1014         const __m128i cma4 = _mm_max_epi16(c, ma4);
1015 
1016         const __m128i cmi1 = _mm_min_epi16(c, mi1);
1017         const __m128i cmi2 = _mm_min_epi16(c, mi2);
1018         const __m128i cmi3 = _mm_min_epi16(c, mi3);
1019         const __m128i cmi4 = _mm_min_epi16(c, mi4);
1020 
1021         const __m128i clipped1 = limit_epi16(c, mi1, ma1);
1022         const __m128i clipped2 = limit_epi16(c, mi2, ma2);
1023         const __m128i clipped3 = limit_epi16(c, mi3, ma3);
1024         const __m128i clipped4 = limit_epi16(c, mi4, ma4);
1025 
1026         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
1027         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
1028         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
1029         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
1030         const __m128i cu = _mm_xor_si128(c, mask_sign);
1031 
1032         const __m128i c1u = abs_dif_epu16(cu, clipped1u);
1033         const __m128i c2u = abs_dif_epu16(cu, clipped2u);
1034         const __m128i c3u = abs_dif_epu16(cu, clipped3u);
1035         const __m128i c4u = abs_dif_epu16(cu, clipped4u);
1036 
1037         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
1038         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
1039         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
1040         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
1041 
1042         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
1043 
1044         __m128i result = select_16_equ(mindiff, c1, limit_epi16(cr, cmi1, cma1), cr);
1045         result = select_16_equ(mindiff, c3, limit_epi16(cr, cmi3, cma3), result);
1046         result = select_16_equ(mindiff, c2, limit_epi16(cr, cmi2, cma2), result);
1047         return select_16_equ(mindiff, c4, limit_epi16(cr, cmi4, cma4), result);
1048     }
1049 #endif
1050 };
1051 
1052 class OpRG16 {
1053 public:
1054     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1055     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1056         AvsFilterRepair16_SORT_AXIS_CPP
1057 
1058         const int      d1 = ma1 - mi1;
1059         const int      d2 = ma2 - mi2;
1060         const int      d3 = ma3 - mi3;
1061         const int      d4 = ma4 - mi4;
1062 
1063         const int      c1 = limit((std::abs(c - limit(c, mi1, ma1)) << 1) + d1, 0, 0xFFFF);
1064         const int      c2 = limit((std::abs(c - limit(c, mi2, ma2)) << 1) + d2, 0, 0xFFFF);
1065         const int      c3 = limit((std::abs(c - limit(c, mi3, ma3)) << 1) + d3, 0, 0xFFFF);
1066         const int      c4 = limit((std::abs(c - limit(c, mi4, ma4)) << 1) + d4, 0, 0xFFFF);
1067 
1068         const int      mindiff = std::min(std::min(c1, c2), std::min(c3, c4));
1069 
1070         int            mi;
1071         int            ma;
1072         if (mindiff == c4) {
1073             mi = mi4;
1074             ma = ma4;
1075         } else if (mindiff == c2) {
1076             mi = mi2;
1077             ma = ma2;
1078         } else if (mindiff == c3) {
1079             mi = mi3;
1080             ma = ma3;
1081         } else {
1082             mi = mi1;
1083             ma = ma1;
1084         }
1085 
1086         mi = std::min(mi, c);
1087         ma = std::max(ma, c);
1088 
1089         return (limit(cr, mi, ma));
1090     }
1091 #ifdef VS_TARGET_CPU_X86
1092     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1093     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1094         AvsFilterRepair16_READ_PIX
1095         AvsFilterRepair16_SORT_AXIS_SSE2
1096 
1097         const __m128i cma1 = _mm_max_epi16(c, ma1);
1098         const __m128i cma2 = _mm_max_epi16(c, ma2);
1099         const __m128i cma3 = _mm_max_epi16(c, ma3);
1100         const __m128i cma4 = _mm_max_epi16(c, ma4);
1101 
1102         const __m128i cmi1 = _mm_min_epi16(c, mi1);
1103         const __m128i cmi2 = _mm_min_epi16(c, mi2);
1104         const __m128i cmi3 = _mm_min_epi16(c, mi3);
1105         const __m128i cmi4 = _mm_min_epi16(c, mi4);
1106 
1107         const __m128i d1 = _mm_sub_epi16(ma1, mi1);
1108         const __m128i d2 = _mm_sub_epi16(ma2, mi2);
1109         const __m128i d3 = _mm_sub_epi16(ma3, mi3);
1110         const __m128i d4 = _mm_sub_epi16(ma4, mi4);
1111 
1112         const __m128i clipped1 = limit_epi16(c, mi1, ma1);
1113         const __m128i clipped2 = limit_epi16(c, mi2, ma2);
1114         const __m128i clipped3 = limit_epi16(c, mi3, ma3);
1115         const __m128i clipped4 = limit_epi16(c, mi4, ma4);
1116 
1117         const __m128i clipped1u = _mm_xor_si128(clipped1, mask_sign);
1118         const __m128i clipped2u = _mm_xor_si128(clipped2, mask_sign);
1119         const __m128i clipped3u = _mm_xor_si128(clipped3, mask_sign);
1120         const __m128i clipped4u = _mm_xor_si128(clipped4, mask_sign);
1121         const __m128i cu = _mm_xor_si128(c, mask_sign);
1122 
1123         const __m128i absdiff1 = abs_dif_epu16(cu, clipped1u);
1124         const __m128i absdiff2 = abs_dif_epu16(cu, clipped2u);
1125         const __m128i absdiff3 = abs_dif_epu16(cu, clipped3u);
1126         const __m128i absdiff4 = abs_dif_epu16(cu, clipped4u);
1127 
1128         const __m128i c1u = _mm_adds_epu16(_mm_adds_epu16(absdiff1, absdiff1), d1);
1129         const __m128i c2u = _mm_adds_epu16(_mm_adds_epu16(absdiff2, absdiff2), d2);
1130         const __m128i c3u = _mm_adds_epu16(_mm_adds_epu16(absdiff3, absdiff3), d3);
1131         const __m128i c4u = _mm_adds_epu16(_mm_adds_epu16(absdiff4, absdiff4), d4);
1132 
1133         const __m128i c1 = _mm_xor_si128(c1u, mask_sign);
1134         const __m128i c2 = _mm_xor_si128(c2u, mask_sign);
1135         const __m128i c3 = _mm_xor_si128(c3u, mask_sign);
1136         const __m128i c4 = _mm_xor_si128(c4u, mask_sign);
1137 
1138         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(c1, c2), _mm_min_epi16(c3, c4));
1139 
1140         __m128i result = select_16_equ(mindiff, c1, limit_epi16(cr, cmi1, cma1), cr);
1141         result = select_16_equ(mindiff, c3, limit_epi16(cr, cmi3, cma3), result);
1142         result = select_16_equ(mindiff, c2, limit_epi16(cr, cmi2, cma2), result);
1143         return select_16_equ(mindiff, c4, limit_epi16(cr, cmi4, cma4), result);
1144     }
1145 #endif
1146 };
1147 
1148 class OpRG17 {
1149 public:
1150     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1151     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1152         AvsFilterRepair16_SORT_AXIS_CPP
1153 
1154         const int      l = std::max(std::max(mi1, mi2), std::max(mi3, mi4));
1155         const int      u = std::min(std::min(ma1, ma2), std::min(ma3, ma4));
1156 
1157         const int      mi = std::min(std::min(l, u), c);
1158         const int      ma = std::max(std::max(l, u), c);
1159 
1160         return (limit(cr, mi, ma));
1161     }
1162 #ifdef VS_TARGET_CPU_X86
1163     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1164     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1165         AvsFilterRepair16_READ_PIX
1166         AvsFilterRepair16_SORT_AXIS_SSE2
1167 
1168         const __m128i lower = _mm_max_epi16(_mm_max_epi16(mi1, mi2), _mm_max_epi16(mi3, mi4));
1169         const __m128i upper = _mm_min_epi16(_mm_min_epi16(ma1, ma2), _mm_min_epi16(ma3, ma4));
1170 
1171         const __m128i real_upper = _mm_max_epi16(_mm_max_epi16(upper, lower), c);
1172         const __m128i real_lower = _mm_min_epi16(_mm_min_epi16(upper, lower), c);
1173 
1174         return limit_epi16(cr, real_lower, real_upper);
1175     }
1176 #endif
1177 };
1178 
1179 class OpRG18 {
1180 public:
1181     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1182     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1183         const int      d1 = std::max(std::abs(c - a1), std::abs(c - a8));
1184         const int      d2 = std::max(std::abs(c - a2), std::abs(c - a7));
1185         const int      d3 = std::max(std::abs(c - a3), std::abs(c - a6));
1186         const int      d4 = std::max(std::abs(c - a4), std::abs(c - a5));
1187 
1188         const int      mindiff = std::min(std::min(d1, d2), std::min(d3, d4));
1189 
1190         int            mi;
1191         int            ma;
1192         if (mindiff == d4) {
1193             mi = std::min(a4, a5);
1194             ma = std::max(a4, a5);
1195         } else if (mindiff == d2) {
1196             mi = std::min(a2, a7);
1197             ma = std::max(a2, a7);
1198         } else if (mindiff == d3) {
1199             mi = std::min(a3, a6);
1200             ma = std::max(a3, a6);
1201         } else {
1202             mi = std::min(a1, a8);
1203             ma = std::max(a1, a8);
1204         }
1205 
1206         mi = std::min(mi, c);
1207         ma = std::max(ma, c);
1208 
1209         return (limit(cr, mi, ma));
1210     }
1211 #ifdef VS_TARGET_CPU_X86
1212     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1213     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1214         AvsFilterRepair16_READ_PIX
1215 
1216         const __m128i a1u = _mm_xor_si128(a1, mask_sign);
1217         const __m128i a2u = _mm_xor_si128(a2, mask_sign);
1218         const __m128i a3u = _mm_xor_si128(a3, mask_sign);
1219         const __m128i a4u = _mm_xor_si128(a4, mask_sign);
1220         const __m128i a5u = _mm_xor_si128(a5, mask_sign);
1221         const __m128i a6u = _mm_xor_si128(a6, mask_sign);
1222         const __m128i a7u = _mm_xor_si128(a7, mask_sign);
1223         const __m128i a8u = _mm_xor_si128(a8, mask_sign);
1224         const __m128i cu = _mm_xor_si128(c, mask_sign);
1225 
1226         const __m128i absdiff1u = abs_dif_epu16(cu, a1u);
1227         const __m128i absdiff2u = abs_dif_epu16(cu, a2u);
1228         const __m128i absdiff3u = abs_dif_epu16(cu, a3u);
1229         const __m128i absdiff4u = abs_dif_epu16(cu, a4u);
1230         const __m128i absdiff5u = abs_dif_epu16(cu, a5u);
1231         const __m128i absdiff6u = abs_dif_epu16(cu, a6u);
1232         const __m128i absdiff7u = abs_dif_epu16(cu, a7u);
1233         const __m128i absdiff8u = abs_dif_epu16(cu, a8u);
1234 
1235         const __m128i absdiff1 = _mm_xor_si128(absdiff1u, mask_sign);
1236         const __m128i absdiff2 = _mm_xor_si128(absdiff2u, mask_sign);
1237         const __m128i absdiff3 = _mm_xor_si128(absdiff3u, mask_sign);
1238         const __m128i absdiff4 = _mm_xor_si128(absdiff4u, mask_sign);
1239         const __m128i absdiff5 = _mm_xor_si128(absdiff5u, mask_sign);
1240         const __m128i absdiff6 = _mm_xor_si128(absdiff6u, mask_sign);
1241         const __m128i absdiff7 = _mm_xor_si128(absdiff7u, mask_sign);
1242         const __m128i absdiff8 = _mm_xor_si128(absdiff8u, mask_sign);
1243 
1244         const __m128i d1 = _mm_max_epi16(absdiff1, absdiff8);
1245         const __m128i d2 = _mm_max_epi16(absdiff2, absdiff7);
1246         const __m128i d3 = _mm_max_epi16(absdiff3, absdiff6);
1247         const __m128i d4 = _mm_max_epi16(absdiff4, absdiff5);
1248 
1249         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4));
1250 
1251         const __m128i mi1 = _mm_min_epi16(c, _mm_min_epi16(a1, a8));
1252         const __m128i mi2 = _mm_min_epi16(c, _mm_min_epi16(a2, a7));
1253         const __m128i mi3 = _mm_min_epi16(c, _mm_min_epi16(a3, a6));
1254         const __m128i mi4 = _mm_min_epi16(c, _mm_min_epi16(a4, a5));
1255 
1256         const __m128i ma1 = _mm_max_epi16(c, _mm_max_epi16(a1, a8));
1257         const __m128i ma2 = _mm_max_epi16(c, _mm_max_epi16(a2, a7));
1258         const __m128i ma3 = _mm_max_epi16(c, _mm_max_epi16(a3, a6));
1259         const __m128i ma4 = _mm_max_epi16(c, _mm_max_epi16(a4, a5));
1260 
1261         const __m128i c1 = limit_epi16(cr, mi1, ma1);
1262         const __m128i c2 = limit_epi16(cr, mi2, ma2);
1263         const __m128i c3 = limit_epi16(cr, mi3, ma3);
1264         const __m128i c4 = limit_epi16(cr, mi4, ma4);
1265 
1266         __m128i result = select_16_equ(mindiff, d1, c1, cr);
1267         result = select_16_equ(mindiff, d3, c3, result);
1268         result = select_16_equ(mindiff, d2, c2, result);
1269         return select_16_equ(mindiff, d4, c4, result);
1270     }
1271 #endif
1272 };
1273 
1274 class OpRG19 {
1275 public:
1276     typedef    ConvUnsigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1277     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1278         const int d1 = std::abs(c - a1);
1279         const int d2 = std::abs(c - a2);
1280         const int d3 = std::abs(c - a3);
1281         const int d4 = std::abs(c - a4);
1282         const int d5 = std::abs(c - a5);
1283         const int d6 = std::abs(c - a6);
1284         const int d7 = std::abs(c - a7);
1285         const int d8 = std::abs(c - a8);
1286 
1287         const int mindiff = std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8)));
1288 
1289         return limit(cr, limit(c - mindiff, 0, 0xFFFF), limit(c + mindiff, 0, 0xFFFF));
1290     }
1291 #ifdef VS_TARGET_CPU_X86
1292     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1293     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1294         AvsFilterRepair16_READ_PIX
1295 
1296         const __m128i d1u = abs_dif_epu16(c, a1);
1297         const __m128i d2u = abs_dif_epu16(c, a2);
1298         const __m128i d3u = abs_dif_epu16(c, a3);
1299         const __m128i d4u = abs_dif_epu16(c, a4);
1300         const __m128i d5u = abs_dif_epu16(c, a5);
1301         const __m128i d6u = abs_dif_epu16(c, a6);
1302         const __m128i d7u = abs_dif_epu16(c, a7);
1303         const __m128i d8u = abs_dif_epu16(c, a8);
1304 
1305         const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1306         const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1307         const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1308         const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1309         const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1310         const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1311         const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1312         const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1313 
1314         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8)));
1315 
1316         const __m128i mindiffu = _mm_xor_si128(mindiff, mask_sign);
1317 
1318         const __m128i mi = _mm_xor_si128(_mm_subs_epu16(c, mindiffu), mask_sign);
1319         const __m128i ma = _mm_xor_si128(_mm_adds_epu16(c, mindiffu), mask_sign);
1320 
1321         return _mm_xor_si128(limit_epi16(_mm_xor_si128(cr, mask_sign), mi, ma), mask_sign);
1322     }
1323 #endif
1324 };
1325 
1326 class OpRG20 {
1327 public:
1328     typedef    ConvUnsigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1329     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1330         const int d1 = std::abs(c - a1);
1331         const int d2 = std::abs(c - a2);
1332         const int d3 = std::abs(c - a3);
1333         const int d4 = std::abs(c - a4);
1334         const int d5 = std::abs(c - a5);
1335         const int d6 = std::abs(c - a6);
1336         const int d7 = std::abs(c - a7);
1337         const int d8 = std::abs(c - a8);
1338 
1339         int mindiff = std::min(d1, d2);
1340         int maxdiff = std::max(d1, d2);
1341 
1342         maxdiff = limit(maxdiff, mindiff, d3);
1343         mindiff = std::min(mindiff, d3);
1344 
1345         maxdiff = limit(maxdiff, mindiff, d4);
1346         mindiff = std::min(mindiff, d4);
1347 
1348         maxdiff = limit(maxdiff, mindiff, d5);
1349         mindiff = std::min(mindiff, d5);
1350 
1351         maxdiff = limit(maxdiff, mindiff, d6);
1352         mindiff = std::min(mindiff, d6);
1353 
1354         maxdiff = limit(maxdiff, mindiff, d7);
1355         mindiff = std::min(mindiff, d7);
1356 
1357         maxdiff = limit(maxdiff, mindiff, d8);
1358 
1359         return limit(cr, limit(c - maxdiff, 0, 0xFFFF), limit(c + maxdiff, 0, 0xFFFF));
1360     }
1361 #ifdef VS_TARGET_CPU_X86
1362     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1363     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1364         AvsFilterRepair16_READ_PIX
1365 
1366         const __m128i d1u = abs_dif_epu16(c, a1);
1367         const __m128i d2u = abs_dif_epu16(c, a2);
1368         const __m128i d3u = abs_dif_epu16(c, a3);
1369         const __m128i d4u = abs_dif_epu16(c, a4);
1370         const __m128i d5u = abs_dif_epu16(c, a5);
1371         const __m128i d6u = abs_dif_epu16(c, a6);
1372         const __m128i d7u = abs_dif_epu16(c, a7);
1373         const __m128i d8u = abs_dif_epu16(c, a8);
1374 
1375         const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1376         const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1377         const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1378         const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1379         const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1380         const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1381         const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1382         const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1383 
1384         __m128i mindiff = _mm_min_epi16(d1, d2);
1385         __m128i maxdiff = _mm_max_epi16(d1, d2);
1386 
1387         maxdiff = limit_epi16(maxdiff, mindiff, d3);
1388         mindiff = _mm_min_epi16(mindiff, d3);
1389 
1390         maxdiff = limit_epi16(maxdiff, mindiff, d4);
1391         mindiff = _mm_min_epi16(mindiff, d4);
1392 
1393         maxdiff = limit_epi16(maxdiff, mindiff, d5);
1394         mindiff = _mm_min_epi16(mindiff, d5);
1395 
1396         maxdiff = limit_epi16(maxdiff, mindiff, d6);
1397         mindiff = _mm_min_epi16(mindiff, d6);
1398 
1399         maxdiff = limit_epi16(maxdiff, mindiff, d7);
1400         mindiff = _mm_min_epi16(mindiff, d7);
1401 
1402         maxdiff = limit_epi16(maxdiff, mindiff, d8);
1403 
1404         const __m128i maxdiffu = _mm_xor_si128(maxdiff, mask_sign);
1405 
1406         const __m128i mi = _mm_xor_si128(_mm_subs_epu16(c, maxdiffu), mask_sign);
1407         const __m128i ma = _mm_xor_si128(_mm_adds_epu16(c, maxdiffu), mask_sign);
1408 
1409         return _mm_xor_si128(limit_epi16(_mm_xor_si128(cr, mask_sign), mi, ma), mask_sign);
1410     }
1411 #endif
1412 };
1413 
1414 class OpRG21 {
1415 public:
1416     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1417     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1418         AvsFilterRepair16_SORT_AXIS_CPP
1419 
1420         const int d1 = limit(ma1 - c, 0, 0xFFFF);
1421         const int d2 = limit(ma2 - c, 0, 0xFFFF);
1422         const int d3 = limit(ma3 - c, 0, 0xFFFF);
1423         const int d4 = limit(ma4 - c, 0, 0xFFFF);
1424 
1425         const int rd1 = limit(c - mi1, 0, 0xFFFF);
1426         const int rd2 = limit(c - mi2, 0, 0xFFFF);
1427         const int rd3 = limit(c - mi3, 0, 0xFFFF);
1428         const int rd4 = limit(c - mi4, 0, 0xFFFF);
1429 
1430         const int u1 = std::max(d1, rd1);
1431         const int u2 = std::max(d2, rd2);
1432         const int u3 = std::max(d3, rd3);
1433         const int u4 = std::max(d4, rd4);
1434 
1435         const int u = std::min(std::min(u1, u2), std::min(u3, u4));
1436 
1437         return limit(cr, limit(c - u, 0, 0xFFFF), limit(c + u, 0, 0xFFFF));
1438     }
1439 #ifdef VS_TARGET_CPU_X86
1440     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1441     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1442         AvsFilterRepair16_READ_PIX
1443         AvsFilterRepair16_SORT_AXIS_SSE2
1444 
1445         const __m128i d1 = _mm_subs_epi16(ma1, c);
1446         const __m128i d2 = _mm_subs_epi16(ma2, c);
1447         const __m128i d3 = _mm_subs_epi16(ma3, c);
1448         const __m128i d4 = _mm_subs_epi16(ma4, c);
1449 
1450         const __m128i rd1 = _mm_subs_epi16(c, mi1);
1451         const __m128i rd2 = _mm_subs_epi16(c, mi2);
1452         const __m128i rd3 = _mm_subs_epi16(c, mi3);
1453         const __m128i rd4 = _mm_subs_epi16(c, mi4);
1454 
1455         const __m128i u1 = _mm_max_epi16(d1, rd1);
1456         const __m128i u2 = _mm_max_epi16(d2, rd2);
1457         const __m128i u3 = _mm_max_epi16(d3, rd3);
1458         const __m128i u4 = _mm_max_epi16(d4, rd4);
1459 
1460         const __m128i u = _mm_min_epi16(_mm_min_epi16(u1, u2), _mm_min_epi16(u3, u4));
1461 
1462         const __m128i mi = _mm_subs_epi16(c, u);
1463         const __m128i ma = _mm_adds_epi16(c, u);
1464 
1465         return limit_epi16(cr, mi, ma);
1466     }
1467 #endif
1468 };
1469 
1470 class OpRG22 {
1471 public:
1472     typedef    ConvUnsigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1473     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1474         const int d1 = std::abs(cr - a1);
1475         const int d2 = std::abs(cr - a2);
1476         const int d3 = std::abs(cr - a3);
1477         const int d4 = std::abs(cr - a4);
1478         const int d5 = std::abs(cr - a5);
1479         const int d6 = std::abs(cr - a6);
1480         const int d7 = std::abs(cr - a7);
1481         const int d8 = std::abs(cr - a8);
1482 
1483         const int mindiff = std::min(std::min(std::min(d1, d2), std::min(d3, d4)), std::min(std::min(d5, d6), std::min(d7, d8)));
1484 
1485         return limit(c, limit(cr - mindiff, 0, 0xFFFF), limit(cr + mindiff, 0, 0xFFFF));
1486     }
1487 #ifdef VS_TARGET_CPU_X86
1488     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1489     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1490         AvsFilterRepair16_READ_PIX
1491 
1492         const __m128i d1u = abs_dif_epu16(cr, a1);
1493         const __m128i d2u = abs_dif_epu16(cr, a2);
1494         const __m128i d3u = abs_dif_epu16(cr, a3);
1495         const __m128i d4u = abs_dif_epu16(cr, a4);
1496         const __m128i d5u = abs_dif_epu16(cr, a5);
1497         const __m128i d6u = abs_dif_epu16(cr, a6);
1498         const __m128i d7u = abs_dif_epu16(cr, a7);
1499         const __m128i d8u = abs_dif_epu16(cr, a8);
1500 
1501         const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1502         const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1503         const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1504         const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1505         const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1506         const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1507         const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1508         const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1509 
1510         const __m128i mindiff = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(d1, d2), _mm_min_epi16(d3, d4)), _mm_min_epi16(_mm_min_epi16(d5, d6), _mm_min_epi16(d7, d8)));
1511 
1512         const __m128i mindiffu = _mm_xor_si128(mindiff, mask_sign);
1513 
1514         const __m128i mi = _mm_xor_si128(_mm_subs_epu16(cr, mindiffu), mask_sign);
1515         const __m128i ma = _mm_xor_si128(_mm_adds_epu16(cr, mindiffu), mask_sign);
1516 
1517         return _mm_xor_si128(limit_epi16(_mm_xor_si128(c, mask_sign), mi, ma), mask_sign);
1518     }
1519 #endif
1520 };
1521 
1522 class OpRG23 {
1523 public:
1524     typedef    ConvUnsigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1525     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1526         const int d1 = std::abs(cr - a1);
1527         const int d2 = std::abs(cr - a2);
1528         const int d3 = std::abs(cr - a3);
1529         const int d4 = std::abs(cr - a4);
1530         const int d5 = std::abs(cr - a5);
1531         const int d6 = std::abs(cr - a6);
1532         const int d7 = std::abs(cr - a7);
1533         const int d8 = std::abs(cr - a8);
1534 
1535         int mindiff = std::min(d1, d2);
1536         int maxdiff = std::max(d1, d2);
1537 
1538         maxdiff = limit(maxdiff, mindiff, d3);
1539         mindiff = std::min(mindiff, d3);
1540 
1541         maxdiff = limit(maxdiff, mindiff, d4);
1542         mindiff = std::min(mindiff, d4);
1543 
1544         maxdiff = limit(maxdiff, mindiff, d5);
1545         mindiff = std::min(mindiff, d5);
1546 
1547         maxdiff = limit(maxdiff, mindiff, d6);
1548         mindiff = std::min(mindiff, d6);
1549 
1550         maxdiff = limit(maxdiff, mindiff, d7);
1551         mindiff = std::min(mindiff, d7);
1552 
1553         maxdiff = limit(maxdiff, mindiff, d8);
1554 
1555         return limit(c, limit(cr - maxdiff, 0, 0xFFFF), limit(cr + maxdiff, 0, 0xFFFF));
1556     }
1557 #ifdef VS_TARGET_CPU_X86
1558     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1559     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1560         AvsFilterRepair16_READ_PIX
1561 
1562         const __m128i d1u = abs_dif_epu16(cr, a1);
1563         const __m128i d2u = abs_dif_epu16(cr, a2);
1564         const __m128i d3u = abs_dif_epu16(cr, a3);
1565         const __m128i d4u = abs_dif_epu16(cr, a4);
1566         const __m128i d5u = abs_dif_epu16(cr, a5);
1567         const __m128i d6u = abs_dif_epu16(cr, a6);
1568         const __m128i d7u = abs_dif_epu16(cr, a7);
1569         const __m128i d8u = abs_dif_epu16(cr, a8);
1570 
1571         const __m128i d1 = _mm_xor_si128(d1u, mask_sign);
1572         const __m128i d2 = _mm_xor_si128(d2u, mask_sign);
1573         const __m128i d3 = _mm_xor_si128(d3u, mask_sign);
1574         const __m128i d4 = _mm_xor_si128(d4u, mask_sign);
1575         const __m128i d5 = _mm_xor_si128(d5u, mask_sign);
1576         const __m128i d6 = _mm_xor_si128(d6u, mask_sign);
1577         const __m128i d7 = _mm_xor_si128(d7u, mask_sign);
1578         const __m128i d8 = _mm_xor_si128(d8u, mask_sign);
1579 
1580         __m128i mindiff = _mm_min_epi16(d1, d2);
1581         __m128i maxdiff = _mm_max_epi16(d1, d2);
1582 
1583         maxdiff = limit_epi16(maxdiff, mindiff, d3);
1584         mindiff = _mm_min_epi16(mindiff, d3);
1585 
1586         maxdiff = limit_epi16(maxdiff, mindiff, d4);
1587         mindiff = _mm_min_epi16(mindiff, d4);
1588 
1589         maxdiff = limit_epi16(maxdiff, mindiff, d5);
1590         mindiff = _mm_min_epi16(mindiff, d5);
1591 
1592         maxdiff = limit_epi16(maxdiff, mindiff, d6);
1593         mindiff = _mm_min_epi16(mindiff, d6);
1594 
1595         maxdiff = limit_epi16(maxdiff, mindiff, d7);
1596         mindiff = _mm_min_epi16(mindiff, d7);
1597 
1598         maxdiff = limit_epi16(maxdiff, mindiff, d8);
1599 
1600         const __m128i maxdiffu = _mm_xor_si128(maxdiff, mask_sign);
1601 
1602         const __m128i mi = _mm_xor_si128(_mm_subs_epu16(cr, maxdiffu), mask_sign);
1603         const __m128i ma = _mm_xor_si128(_mm_adds_epu16(cr, maxdiffu), mask_sign);
1604 
1605         return _mm_xor_si128(limit_epi16(_mm_xor_si128(c, mask_sign), mi, ma), mask_sign);
1606     }
1607 #endif
1608 };
1609 
1610 class OpRG24 {
1611 public:
1612     typedef    ConvSigned    ConvSign;
rg(int cr,int a1,int a2,int a3,int a4,int c,int a5,int a6,int a7,int a8)1613     static __forceinline int rg(int cr, int a1, int a2, int a3, int a4, int c, int a5, int a6, int a7, int a8) {
1614         AvsFilterRepair16_SORT_AXIS_CPP
1615 
1616         const int d1 = limit(ma1 - cr, 0, 0xFFFF);
1617         const int d2 = limit(ma2 - cr, 0, 0xFFFF);
1618         const int d3 = limit(ma3 - cr, 0, 0xFFFF);
1619         const int d4 = limit(ma4 - cr, 0, 0xFFFF);
1620 
1621         const int rd1 = limit(cr - mi1, 0, 0xFFFF);
1622         const int rd2 = limit(cr - mi2, 0, 0xFFFF);
1623         const int rd3 = limit(cr - mi3, 0, 0xFFFF);
1624         const int rd4 = limit(cr - mi4, 0, 0xFFFF);
1625 
1626         const int u1 = std::max(d1, rd1);
1627         const int u2 = std::max(d2, rd2);
1628         const int u3 = std::max(d3, rd3);
1629         const int u4 = std::max(d4, rd4);
1630 
1631         const int u = std::min(std::min(u1, u2), std::min(u3, u4));
1632 
1633         return limit(c, limit(cr - u, 0, 0xFFFF), limit(cr + u, 0, 0xFFFF));
1634     }
1635 #ifdef VS_TARGET_CPU_X86
1636     template<typename T>
rg(const T * src1_ptr,const T * src2_ptr,int stride_src2,__m128i mask_sign)1637     static __forceinline __m128i rg(const T *src1_ptr, const T *src2_ptr, int stride_src2, __m128i mask_sign) {
1638         AvsFilterRepair16_READ_PIX
1639         AvsFilterRepair16_SORT_AXIS_SSE2
1640 
1641         const __m128i d1 = _mm_subs_epi16(ma1, cr);
1642         const __m128i d2 = _mm_subs_epi16(ma2, cr);
1643         const __m128i d3 = _mm_subs_epi16(ma3, cr);
1644         const __m128i d4 = _mm_subs_epi16(ma4, cr);
1645 
1646         const __m128i rd1 = _mm_subs_epi16(cr, mi1);
1647         const __m128i rd2 = _mm_subs_epi16(cr, mi2);
1648         const __m128i rd3 = _mm_subs_epi16(cr, mi3);
1649         const __m128i rd4 = _mm_subs_epi16(cr, mi4);
1650 
1651         const __m128i u1 = _mm_max_epi16(d1, rd1);
1652         const __m128i u2 = _mm_max_epi16(d2, rd2);
1653         const __m128i u3 = _mm_max_epi16(d3, rd3);
1654         const __m128i u4 = _mm_max_epi16(d4, rd4);
1655 
1656         const __m128i u = _mm_min_epi16(_mm_min_epi16(u1, u2), _mm_min_epi16(u3, u4));
1657 
1658         const __m128i mi = _mm_subs_epi16(cr, u);
1659         const __m128i ma = _mm_adds_epi16(cr, u);
1660 
1661         return limit_epi16(c, mi, ma);
1662     }
1663 #endif
1664 };
1665 
1666 
1667 template <class OP, class T>
1668 class PlaneProc {
1669 public:
1670 
process_subplane_cpp(const T * src1_ptr,const T * src2_ptr,T * dst_ptr,int stride,int width,int height)1671 static void process_subplane_cpp (const T *src1_ptr, const T *src2_ptr, T *dst_ptr, int stride, int width, int height)
1672 {
1673     const int        y_b = 1;
1674     const int        y_e = height - 1;
1675 
1676     dst_ptr += y_b * stride;
1677     src1_ptr += y_b * stride;
1678     src2_ptr += y_b * stride;
1679 
1680     const int        x_e = width - 1;
1681 
1682     for (int y = y_b; y < y_e; ++y)
1683     {
1684         dst_ptr [0] = src1_ptr [0];
1685 
1686         process_row_cpp (
1687             dst_ptr,
1688             src1_ptr,
1689             src2_ptr,
1690             stride,
1691             1,
1692             x_e
1693         );
1694 
1695         dst_ptr [x_e] = src1_ptr [x_e];
1696 
1697         dst_ptr += stride;
1698         src1_ptr += stride;
1699         src2_ptr += stride;
1700     }
1701 }
1702 
process_row_cpp(T * dst_ptr,const T * src1_ptr,const T * src2_ptr,int stride_src,int x_beg,int x_end)1703 static void process_row_cpp (T *dst_ptr, const T *src1_ptr, const T *src2_ptr, int stride_src, int x_beg, int x_end)
1704 {
1705     const int      om = stride_src - 1;
1706     const int      o0 = stride_src    ;
1707     const int      op = stride_src + 1;
1708 
1709     src1_ptr += x_beg;
1710     src2_ptr += x_beg;
1711 
1712     for (int x = x_beg; x < x_end; ++x)
1713     {
1714         const int       cr = src1_ptr [0];
1715         const int        a1 = src2_ptr [-op];
1716         const int        a2 = src2_ptr [-o0];
1717         const int        a3 = src2_ptr [-om];
1718         const int        a4 = src2_ptr [-1 ];
1719         const int        c  = src2_ptr [ 0 ];
1720         const int        a5 = src2_ptr [ 1 ];
1721         const int        a6 = src2_ptr [ om];
1722         const int        a7 = src2_ptr [ o0];
1723         const int        a8 = src2_ptr [ op];
1724 
1725         const int        res = OP::rg (cr, a1, a2, a3, a4, c, a5, a6, a7, a8);
1726 
1727         dst_ptr [x] = res;
1728 
1729         ++ src1_ptr;
1730         ++ src2_ptr;
1731     }
1732 }
1733 
1734 #ifdef VS_TARGET_CPU_X86
process_subplane_sse2(const T * src1_ptr,const T * src2_ptr,T * dst_ptr,int stride,int width,int height)1735 static void process_subplane_sse2 (const T *src1_ptr, const T *src2_ptr, T *dst_ptr, int stride, int width, int height)
1736 {
1737     const int        y_b = 1;
1738     const int        y_e = height - 1;
1739 
1740     dst_ptr += y_b * stride;
1741     src1_ptr += y_b * stride;
1742     src2_ptr += y_b * stride;
1743 
1744     const __m128i    mask_sign = _mm_set1_epi16 (-0x8000);
1745 
1746     const int        x_e =   width - 1;
1747     const int        w8  = ((width - 2) & -8) + 1;
1748 
1749     for (int y = y_b; y < y_e; ++y)
1750     {
1751         dst_ptr [0] = src1_ptr [0];
1752 
1753         for (int x = 1; x < w8; x += 8)
1754         {
1755             __m128i            res = OP::rg (
1756                 src1_ptr + x,
1757                 src2_ptr + x,
1758                 stride,
1759                 mask_sign
1760             );
1761 
1762             res = OP::ConvSign::cv (res, mask_sign);
1763             if (sizeof(T) == 1)
1764                 _mm_storel_epi64 (reinterpret_cast<__m128i *>(dst_ptr + x), _mm_packus_epi16 (res, res));
1765             else
1766                 _mm_storeu_si128 (reinterpret_cast<__m128i *>(dst_ptr + x), res);
1767         }
1768 
1769         process_row_cpp (
1770             dst_ptr,
1771             src1_ptr,
1772             src2_ptr,
1773             stride,
1774             w8,
1775             x_e
1776         );
1777 
1778         dst_ptr [x_e] = src1_ptr [x_e];
1779 
1780         dst_ptr += stride;
1781         src1_ptr += stride;
1782         src2_ptr += stride;
1783     }
1784 }
1785 
1786 template <class OP1, class T1>
do_process_plane_sse2(const VSFrameRef * src1_frame,const VSFrameRef * src2_frame,VSFrameRef * dst_frame,int plane_id,const VSAPI * vsapi)1787 static void do_process_plane_sse2 (const VSFrameRef *src1_frame, const VSFrameRef *src2_frame, VSFrameRef *dst_frame, int plane_id, const VSAPI *vsapi)
1788 {
1789     const int        w             = vsapi->getFrameWidth(src1_frame, plane_id);
1790     const int        h             = vsapi->getFrameHeight(src1_frame, plane_id);
1791     T1 *            dst_ptr       = reinterpret_cast<T1*>(vsapi->getWritePtr(dst_frame, plane_id));
1792     const int        stride        = vsapi->getStride(src1_frame, plane_id);
1793 
1794     const T1*        src1_ptr       = reinterpret_cast<const T1*>(vsapi->getReadPtr(src1_frame, plane_id));
1795     const T1*        src2_ptr       = reinterpret_cast<const T1*>(vsapi->getReadPtr(src2_frame, plane_id));
1796 
1797     // First line
1798     memcpy (dst_ptr, src1_ptr, stride);
1799 
1800     // Main content
1801     PlaneProc<OP1, T1>::process_subplane_sse2(src1_ptr, src2_ptr, dst_ptr, stride/sizeof(T1), w, h);
1802 
1803     // Last line
1804     const int        lp = (h - 1) * stride/sizeof(T1);
1805     memcpy (dst_ptr + lp, src1_ptr + lp, stride);
1806 }
1807 
1808 #endif
1809 
1810 template <class OP1, class T1>
do_process_plane_cpp(const VSFrameRef * src1_frame,const VSFrameRef * src2_frame,VSFrameRef * dst_frame,int plane_id,const VSAPI * vsapi)1811 static void do_process_plane_cpp (const VSFrameRef *src1_frame, const VSFrameRef *src2_frame, VSFrameRef *dst_frame, int plane_id, const VSAPI *vsapi)
1812 {
1813     const int        w             = vsapi->getFrameWidth(src1_frame, plane_id);
1814     const int        h             = vsapi->getFrameHeight(src1_frame, plane_id);
1815     T1 *            dst_ptr       = reinterpret_cast<T1*>(vsapi->getWritePtr(dst_frame, plane_id));
1816     const int        stride        = vsapi->getStride(src1_frame, plane_id);
1817 
1818     const T1*        src1_ptr       = reinterpret_cast<const T1*>(vsapi->getReadPtr(src1_frame, plane_id));
1819     const T1*        src2_ptr       = reinterpret_cast<const T1*>(vsapi->getReadPtr(src2_frame, plane_id));
1820 
1821     // First line
1822     memcpy (dst_ptr, src1_ptr, stride);
1823 
1824     // Main content
1825     PlaneProc<OP1, T1>::process_subplane_cpp(src1_ptr, src2_ptr, dst_ptr, stride/sizeof(T1), w, h);
1826 
1827     // Last line
1828     const int        lp = (h - 1) * stride/sizeof(T1);
1829     memcpy (dst_ptr + lp, src1_ptr + lp, stride);
1830 }
1831 
1832 };
1833 
1834 typedef struct {
1835     VSNodeRef *node1;
1836     VSNodeRef *node2;
1837     const VSVideoInfo *vi;
1838     int mode[3];
1839 } RepairData;
1840 
repairInit(VSMap * in,VSMap * out,void ** instanceData,VSNode * node,VSCore * core,const VSAPI * vsapi)1841 static void VS_CC repairInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
1842     RepairData *d = static_cast<RepairData *>(*instanceData);
1843     vsapi->setVideoInfo(d->vi, 1, node);
1844 }
1845 
repairGetFrame(int n,int activationReason,void ** instanceData,void ** frameData,VSFrameContext * frameCtx,VSCore * core,const VSAPI * vsapi)1846 static const VSFrameRef *VS_CC repairGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
1847     RepairData *d = static_cast<RepairData *>(*instanceData);
1848 
1849     if (activationReason == arInitial) {
1850         vsapi->requestFrameFilter(n, d->node1, frameCtx);
1851         vsapi->requestFrameFilter(n, d->node2, frameCtx);
1852     } else if (activationReason == arAllFramesReady) {
1853         const VSFrameRef *src1_frame = vsapi->getFrameFilter(n, d->node1, frameCtx);
1854         const VSFrameRef *src2_frame = vsapi->getFrameFilter(n, d->node2, frameCtx);
1855         int planes[3] = {0, 1, 2};
1856         const VSFrameRef * cp_planes[3] = { d->mode[0] ? nullptr : src1_frame, d->mode[1] ? nullptr : src1_frame, d->mode[2] ? nullptr : src1_frame };
1857         VSFrameRef *dst_frame = vsapi->newVideoFrame2(vsapi->getFrameFormat(src1_frame), vsapi->getFrameWidth(src1_frame, 0), vsapi->getFrameHeight(src1_frame, 0), cp_planes, planes, src1_frame, core);
1858 
1859 
1860 #define PROC_ARGS_16(op) PlaneProc <op, uint16_t>::do_process_plane_cpp<op, uint16_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1861 #define PROC_ARGS_8(op) PlaneProc <op, uint16_t>::do_process_plane_cpp<op, uint8_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1862 
1863 #ifdef VS_TARGET_CPU_X86
1864 #define PROC_ARGS_16_FAST(op) PlaneProc <op, uint16_t>::do_process_plane_sse2<op, uint16_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1865 #define PROC_ARGS_8_FAST(op) PlaneProc <op, uint8_t>::do_process_plane_sse2<op, uint8_t>(src1_frame, src2_frame, dst_frame, i, vsapi); break;
1866 #else
1867 #define PROC_ARGS_16_FAST(op) PROC_ARGS_16(op)
1868 #define PROC_ARGS_8_FAST(op) PROC_ARGS_8(op)
1869 #endif
1870 
1871 
1872         if (d->vi->format->bytesPerSample == 1) {
1873             for (int i = 0; i < d->vi->format->numPlanes; i++) {
1874                 switch (d->mode[i])
1875                 {
1876                     case  1: PROC_ARGS_8_FAST(OpRG01)
1877                     case  2: PROC_ARGS_8_FAST(OpRG02)
1878                     case  3: PROC_ARGS_8_FAST(OpRG03)
1879                     case  4: PROC_ARGS_8_FAST(OpRG04)
1880                     case  5: PROC_ARGS_8_FAST(OpRG05)
1881                     case  6: PROC_ARGS_8_FAST(OpRG06)
1882                     case  7: PROC_ARGS_8_FAST(OpRG07)
1883                     case  8: PROC_ARGS_8_FAST(OpRG08)
1884                     case  9: PROC_ARGS_8_FAST(OpRG09)
1885                     case 10: PROC_ARGS_8_FAST(OpRG10)
1886                     case 11: PROC_ARGS_8_FAST(OpRG01)
1887                     case 12: PROC_ARGS_8_FAST(OpRG12)
1888                     case 13: PROC_ARGS_8_FAST(OpRG13)
1889                     case 14: PROC_ARGS_8_FAST(OpRG14)
1890                     case 15: PROC_ARGS_8_FAST(OpRG15)
1891                     case 16: PROC_ARGS_8_FAST(OpRG16)
1892                     case 17: PROC_ARGS_8_FAST(OpRG17)
1893                     case 18: PROC_ARGS_8_FAST(OpRG18)
1894                     case 19: PROC_ARGS_8_FAST(OpRG19)
1895                     case 20: PROC_ARGS_8_FAST(OpRG20)
1896                     case 21: PROC_ARGS_8_FAST(OpRG21)
1897                     case 22: PROC_ARGS_8_FAST(OpRG22)
1898                     case 23: PROC_ARGS_8_FAST(OpRG23)
1899                     case 24: PROC_ARGS_8_FAST(OpRG24)
1900                     default: break;
1901                 }
1902             }
1903         } else {
1904             for (int i = 0; i < d->vi->format->numPlanes; i++) {
1905                 switch (d->mode[i])
1906                 {
1907                     case  1: PROC_ARGS_16_FAST(OpRG01)
1908                     case  2: PROC_ARGS_16_FAST(OpRG02)
1909                     case  3: PROC_ARGS_16_FAST(OpRG03)
1910                     case  4: PROC_ARGS_16_FAST(OpRG04)
1911                     case  5: PROC_ARGS_16_FAST(OpRG05)
1912                     case  6: PROC_ARGS_16_FAST(OpRG06)
1913                     case  7: PROC_ARGS_16_FAST(OpRG07)
1914                     case  8: PROC_ARGS_16_FAST(OpRG08)
1915                     case  9: PROC_ARGS_16_FAST(OpRG09)
1916                     case 10: PROC_ARGS_16_FAST(OpRG10)
1917                     case 11: PROC_ARGS_16_FAST(OpRG01)
1918                     case 12: PROC_ARGS_16_FAST(OpRG12)
1919                     case 13: PROC_ARGS_16_FAST(OpRG13)
1920                     case 14: PROC_ARGS_16_FAST(OpRG14)
1921                     case 15: PROC_ARGS_16_FAST(OpRG15)
1922                     case 16: PROC_ARGS_16_FAST(OpRG16)
1923                     case 17: PROC_ARGS_16_FAST(OpRG17)
1924                     case 18: PROC_ARGS_16_FAST(OpRG18)
1925                     case 19: PROC_ARGS_16_FAST(OpRG19)
1926                     case 20: PROC_ARGS_16_FAST(OpRG20)
1927                     case 21: PROC_ARGS_16_FAST(OpRG21)
1928                     case 22: PROC_ARGS_16_FAST(OpRG22)
1929                     case 23: PROC_ARGS_16_FAST(OpRG23)
1930                     case 24: PROC_ARGS_16_FAST(OpRG24)
1931                     default: break;
1932                 }
1933             }
1934         }
1935 
1936         vsapi->freeFrame(src1_frame);
1937         vsapi->freeFrame(src2_frame);
1938         return dst_frame;
1939     }
1940 
1941     return nullptr;
1942 }
1943 
repairFree(void * instanceData,VSCore * core,const VSAPI * vsapi)1944 static void VS_CC repairFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
1945     RepairData *d = static_cast<RepairData *>(instanceData);
1946     vsapi->freeNode(d->node1);
1947     vsapi->freeNode(d->node2);
1948     delete d;
1949 }
1950 
repairCreate(const VSMap * in,VSMap * out,void * userData,VSCore * core,const VSAPI * vsapi)1951 void VS_CC repairCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
1952     RepairData d;
1953 
1954     d.node1 = vsapi->propGetNode(in, "clip", 0, nullptr);
1955     d.vi = vsapi->getVideoInfo(d.node1);
1956 
1957     if (!isConstantFormat(d.vi)) {
1958         vsapi->freeNode(d.node1);
1959         vsapi->setError(out, "Repair: Only constant format input supported");
1960         return;
1961     }
1962 
1963     d.node2 = vsapi->propGetNode(in, "repairclip", 0, nullptr);
1964 
1965     if (!isSameFormat(d.vi, vsapi->getVideoInfo(d.node2))) {
1966         vsapi->freeNode(d.node1);
1967         vsapi->freeNode(d.node2);
1968         vsapi->setError(out, "Repair: Input clips must have the same format");
1969         return;
1970     }
1971 
1972     if (d.vi->format->sampleType != stInteger || (d.vi->format->bytesPerSample != 1 && d.vi->format->bytesPerSample != 2)) {
1973         vsapi->freeNode(d.node1);
1974         vsapi->freeNode(d.node2);
1975         vsapi->setError(out, "Repair: Only 8-16 bit int formats supported");
1976         return;
1977     }
1978 
1979     int n = d.vi->format->numPlanes;
1980     int m = vsapi->propNumElements(in, "mode");
1981     if (n < m) {
1982         vsapi->freeNode(d.node1);
1983         vsapi->freeNode(d.node2);
1984         vsapi->setError(out, "Repair: Number of modes specified must be equal or fewer than the number of input planes");
1985         return;
1986     }
1987 
1988     for (int i = 0; i < 3; i++) {
1989         if (i < m) {
1990             d.mode[i] = int64ToIntS(vsapi->propGetInt(in, "mode", i, nullptr));
1991             if (d.mode[i] < 0 || d.mode[i] > 24)
1992             {
1993                 vsapi->freeNode(d.node1);
1994                 vsapi->freeNode(d.node2);
1995                 vsapi->setError(out, "Repair: Invalid mode specified, only 0-24 supported");
1996                 return;
1997             }
1998         } else {
1999             d.mode[i] = d.mode[i - 1];
2000         }
2001     }
2002 
2003     RepairData *data = new RepairData(d);
2004 
2005     vsapi->createFilter(in, out, "Repair", repairInit, repairGetFrame, repairFree, fmParallel, 0, data, core);
2006 }
2007