1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; Copyright © 2019, VideoLabs
4; All rights reserved.
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions are met:
8;
9; 1. Redistributions of source code must retain the above copyright notice, this
10;    list of conditions and the following disclaimer.
11;
12; 2. Redistributions in binary form must reproduce the above copyright notice,
13;    this list of conditions and the following disclaimer in the documentation
14;    and/or other materials provided with the distribution.
15;
16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27%include "config.asm"
28%include "ext/x86/x86inc.asm"
29
30SECTION_RODATA 16
31
; Emit every argument eight times as a byte, i.e. one 8-byte lane of the
; repeated value per argument (used to build the tap/mask tables below).
%macro DUP8 1-*
 %rep %0
    db %1, %1, %1, %1, %1, %1, %1, %1
  %rotate 1
 %endrep
%endmacro
38
; Reciprocal-style scaling tables for the cdef_dir cost computation:
; dword form for the SSE4.1 pmulld path, duplicated-word form for the
; SSSE3/SSE2 MULLD emulation (values are <= 840, so they fit in 16 bits).
div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
; pshufb mask: reverse words 0-6, keep word 7 in place (applied to the
; upper half of the diagonal partial sums in cdef_dir).
const shufw_6543210x, \
            db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
; pshufb mask: interleave the low and high 8 bytes of a register
; (pairs p0/p1 diffs for pmaddubsw in ACCUMULATE_TAP).
shufb_lohi: db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_8:      times 8 dw 8
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_2048:   times 8 dw 2048
pw_0x7FFF: times 8 dw 0x7FFF  ; out-of-bounds marker (pre-SSE4 paths)
pw_0x8000: times 8 dw 0x8000  ; out-of-bounds marker (SSE4.1 paths)
; tap_table layout (8-byte entries):
;   +0x00: shift masks for emulating per-byte shifts with psrlw
;   +0x40: primary tap weights {4,2},{3,3}, then secondary weights {2,1}
;   +0x70: per-direction offset pairs, in units of 16-bit pixels within
;          the 32-byte-strided px buffer (see ACCUMULATE_TAP)
tap_table: ; masks for 8-bit shift emulation
           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
           ; weights
           DUP8 4, 2, 3, 3, 2, 1
           ; taps indices
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
           db  1 * 16 + 0,  2 * 16 + 0
           db  1 * 16 + 0,  2 * 16 - 1
           ; the last 6 are repeats of the first 6 so we don't need to & 7
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
74
75SECTION .text
76
; mov that is only emitted on x86-32 (no-op on 64-bit); used for stack
; spills/reloads that the 64-bit build keeps in registers.
%macro movif32 2
 %if ARCH_X86_32
    mov     %1, %2
 %endif
%endmacro
82
; Zero-extend 8 packed bytes (4 when %3 == 1) from %2 to words in %1.
; The non-SSE4 fallback unpacks against m7, which must hold zero.
%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw        %1, %2
 %else
  %if %3 == 1
    movd            %1, %2    ; half-width: only 4 source bytes are valid
  %else
    movq            %1, %2
  %endif
    punpcklbw       %1, m7    ; m7 == 0 -> zero-extension
 %endif
%endmacro
95
; Broadcast the low byte of %1 to all 16 bytes.
; %2 must be an all-zero register (an all-zero pshufb mask selects byte 0);
; it is only read on the SSSE3 path.
%macro PSHUFB_0 2
 %if cpuflag(ssse3)
    pshufb          %1, %2
 %else
    punpcklbw       %1, %1           ; word 0 = byte0 | byte0 << 8
    pshuflw         %1, %1, q0000    ; broadcast word 0 to the low qword
    punpcklqdq      %1, %1           ; duplicate into the high qword
 %endif
%endmacro
105
; Broadcast a qword from %2 into both halves of %1.
; movddup is SSE3, so it is available on all SSSE3+ targets of this file;
; the SSE2 fallback uses a load + unpack.
%macro MOVDDUP 2
%if cpuflag(ssse3)
    movddup         %1, %2
%else
    movq            %1, %2
    punpcklqdq      %1, %1
%endif
%endmacro
114
; Accumulate one tap pair (p0 at +offset, p1 at the mirrored -offset)
; into the running sum in m0.
;   %1 = offset into the per-direction tap table (0/4/12 bytes)
;   %2 = shift amount (memory operand for psrlw)
;   %3 = shift mask (from tap_table), %4 = broadcast strength
;   %5 = broadcast tap weight,        %6 = block width
;   %7 = nonzero to also track the neighborhood max (m7) / min (m8)
; Expects: m4 = current px row(s), m13 = shufb_lohi (SSSE3),
;          m14 = out-of-bounds marker (pre-SSE4 minmax path).
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1 (units of words in px)
 %if %6 == 4
    movq            m5, [stkq+offq*2+32*0]      ; p0
    movhps          m5, [stkq+offq*2+32*1]      ; two 4-pixel rows per register
 %else
    movu            m5, [stkq+offq*2+32*0]      ; p0
 %endif
    neg           offq                          ; -off1
 %if %6 == 4
    movq            m6, [stkq+offq*2+32*0]      ; p1
    movhps          m6, [stkq+offq*2+32*1]
 %else
    movu            m6, [stkq+offq*2+32*0]      ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5
    pminuw          m8, m5
    pmaxsw          m7, m6
    pminuw          m8, m6
  %else
    ; pre-SSE4 (no pminuw): the marker is 0x7FFF, so the min is already
    ; safe; mask marker lanes out of the max with a compare against m14
    pcmpeqw         m3, m14, m5
    pminsw          m8, m5     ; min after p0
    pandn           m3, m5
    pmaxsw          m7, m3     ; max after p0
    pcmpeqw         m3, m14, m6
    pminsw          m8, m6     ; min after p1
    pandn           m3, m6
    pmaxsw          m7, m3     ; max after p1
  %endif
 %endif

    ; accumulate sum[m13] over p0/p1
    psubw           m5, m4     ; diff_p0(p0 - px)
    psubw           m6, m4     ; diff_p1(p1 - px)
    packsswb        m5, m6     ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
    pabsb           m6, m5
    psignb          m3, %5, m5
 %else
    ; no pshufb/pabsb/psignb: interleave manually, then emulate abs and
    ; sign transfer with the sign mask in m5
    movlhps         m6, m5
    punpckhbw       m6, m5
    pxor            m5, m5
    pcmpgtb         m5, m6     ; m5 = diff < 0 ? -1 : 0
    paddb           m6, m5
    pxor            m6, m5     ; m6 = abs(diff)
    paddb           m3, %5, m5
    pxor            m3, m5     ; m3 = taps with diff's sign (diff == 0 lanes
                               ; are harmless: constrain() is 0 there)
 %endif
    pand            m9, %3, m6 ; emulate 8-bit shift
    psrlw           m9, %2
    psubusb         m5, %4, m9
    pminub          m5, m6     ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
 %else
    ; emulate pmaddubsw: multiply odd/even byte lanes separately and add
    psrlw           m9, m5, 8
    psraw           m6, m3, 8
    psllw           m5, 8
    psllw           m3, 8
    pmullw          m9, m6
    pmulhw          m5, m3
    paddw           m5, m9
 %endif
    paddw           m0, m5
%endmacro
187
; Expand four 8-bit rows from %2 into the 16-bit intermediate buffer at %1
; (rows are 32 bytes apart). The loads read past the block's right edge,
; so this is only called when have_right is set. m7 must be zero for the
; unpack path; clobbers m0-m3 (and m4 in the 8-wide path).
%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW        m0, [%2+strideq*0]
    PMOVZXBW        m1, [%2+strideq*1]
    PMOVZXBW        m2, [%2+strideq*2]
    PMOVZXBW        m3, [%2+stride3q]
    mova     [%1+32*0], m0
    mova     [%1+32*1], m1
    mova     [%1+32*2], m2
    mova     [%1+32*3], m3
 %else
    ; 8-wide: each row expands to 32 bytes (8 block + 8 right-edge pixels)
    movu            m0, [%2+strideq*0]
    movu            m1, [%2+strideq*1]
    movu            m2, [%2+strideq*2]
    movu            m3, [%2+stride3q]
    punpcklbw       m4, m0, m7
    punpckhbw       m0, m7
    mova  [%1+32*0+ 0], m4
    mova  [%1+32*0+16], m0
    punpcklbw       m4, m1, m7
    punpckhbw       m1, m7
    mova  [%1+32*1+ 0], m4
    mova  [%1+32*1+16], m1
    punpcklbw       m4, m2, m7
    punpckhbw       m2, m7
    mova  [%1+32*2+ 0], m4
    mova  [%1+32*2+16], m2
    punpcklbw       m4, m3, m7
    punpckhbw       m3, m7
    mova  [%1+32*3+ 0], m4
    mova  [%1+32*3+16], m3
 %endif
%endmacro
221
; Finish one filter iteration: round the accumulated tap sum in m0, add it
; to the source pixels in m4, optionally (%2) clamp to the [min, max] range
; tracked in m8/m7, pack to 8-bit and store; advances stkq and dstq.
%macro CDEF_FILTER_END 2 ; w, minmax
    pxor            m6, m6
    pcmpgtw         m6, m0     ; m6 = (sum < 0) ? -1 : 0
    paddw           m0, m6     ; bias the rounding of negative sums toward 0
 %if cpuflag(ssse3)
    pmulhrsw        m0, m15    ; m15 = pw_2048: (x*2048 + 16384) >> 15 == (x+8) >> 4
 %else
    paddw           m0, m15    ; m15 = pw_8
    psraw           m0, 4
 %endif
    paddw           m4, m0
 %if %2
    pminsw          m4, m7     ; clamp to the tap neighborhood max ...
    pmaxsw          m4, m8     ; ... and min
 %endif
    packuswb        m4, m4
 %if %1 == 4
    ; 4-wide processes two rows per iteration (packed low/high qwords)
    movd [dstq+strideq*0], m4
    psrlq           m4, 32
    movd [dstq+strideq*1], m4
    add           stkq, 32*2
    lea           dstq, [dstq+strideq*2]
 %else
    movq        [dstq], m4
    add           stkq, 32
    add           dstq, strideq
 %endif
%endmacro
250
; CDEF filter for one %1 x %2 (w x h) 8bpc block:
;   cdef_filter_wxh_8bpc(dst, stride, left, top, pri_strength,
;                        sec_strength, dir, damping, edges)
; Stage 1 expands the block plus a 2-pixel border into a 16-bit temporary
; buffer `px` (rows 32 bytes apart); border pixels that are unavailable
; per the `edges` bitmask (1 = left, 2 = right, 4 = top, 8 = bottom) are
; filled with an out-of-bounds marker. Stage 2 applies the primary and/or
; secondary taps per row and writes the filtered pixels back to dst.
%macro CDEF_FILTER 2 ; w, h
 %if ARCH_X86_64
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
                                dst, stride, left, top, pri, sec, edge, stride3, dst4
  %define px rsp+3*16+2*32
  %define base 0
 %else
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
                                dst, stride, left, edge, stride3
    %define       topq  r2
    %define      dst4q  r2
    LEA             r5, tap_table
  %define px esp+7*16+2*32
  %define base r5-tap_table
 %endif
    mov          edged, r8m
 %if cpuflag(sse4)
   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
 %else
   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
 %endif
    mova            m6, OUT_OF_BOUNDS_MEM
    pxor            m7, m7

    ; prepare pixel buffers - body/right
 %if %2 == 8
    lea          dst4q, [dstq+strideq*4]
 %endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2 ; have_right
    jz .no_right
    LOAD_BODY       px, dstq, %1
 %if %2 == 8
    LOAD_BODY  px+4*32, dst4q, %1
 %endif
    jmp .body_done
.no_right:
    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
    mova     [px+32*0], m0
    mova     [px+32*1], m1
    mova     [px+32*2], m2
    mova     [px+32*3], m3
    movd [px+32*0+%1*2], m6
    movd [px+32*1+%1*2], m6
    movd [px+32*2+%1*2], m6
    movd [px+32*3+%1*2], m6
 %if %2 == 8
    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
    mova     [px+32*4], m0
    mova     [px+32*5], m1
    mova     [px+32*6], m2
    mova     [px+32*7], m3
    movd [px+32*4+%1*2], m6
    movd [px+32*5+%1*2], m6
    movd [px+32*6+%1*2], m6
    movd [px+32*7+%1*2], m6
 %endif
.body_done:

    ; top
    movifnidn     topq, r3mp
    test         edgeb, 4 ; have_top
    jz .no_top
    test         edgeb, 1 ; have_left
    jz .top_no_left
    test         edgeb, 2 ; have_right
    jz .top_no_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-2]
    PMOVZXBW        m1, [topq+strideq*1-2]
 %else
    movu            m0, [topq+strideq*0-4]
    movu            m1, [topq+strideq*1-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu  [px-32*2+8], m2
    movu  [px-32*1+8], m3
 %endif
    movu  [px-32*2-%1], m0
    movu  [px-32*1-%1], m1
    jmp .top_done
.top_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-%1]
    PMOVZXBW        m1, [topq+strideq*1-%1]
    movu   [px-32*2-8], m0
    movu   [px-32*1-8], m1
 %else
    movu            m0, [topq+strideq*0-%1]
    movu            m1, [topq+strideq*1-%1] ; fix: was -%2 (block height);
                                            ; must mirror the -%1 row above
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova  [px-32*2-16], m0
    mova  [px-32*2+ 0], m2
    mova  [px-32*1-16], m1
    mova  [px-32*1+ 0], m3
 %endif
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.top_no_left:
    test         edgeb, 2 ; have_right
    jz .top_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0]
    PMOVZXBW        m1, [topq+strideq*1]
 %else
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movd  [px-32*2+16], m2
    movd  [px-32*1+16], m3
 %endif
    movd  [px-32*2- 4], m6
    movd  [px-32*1- 4], m6
    mova  [px-32*2+ 0], m0
    mova  [px-32*1+ 0], m1
    jmp .top_done
.top_no_left_right:
    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
    movd   [px-32*2-4], m6
    movd   [px-32*1-4], m6
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.no_top:
    movu  [px-32*2- 4], m6
    movu  [px-32*1- 4], m6
 %if %1 == 8
    movq  [px-32*2+12], m6
    movq  [px-32*1+12], m6
 %endif
.top_done:

    ; left
    test         edgeb, 1 ; have_left
    jz .no_left
    movifnidn    leftq, leftmp
 %if %2 == 4
    movq            m0, [leftq]
 %else
    movu            m0, [leftq]
 %endif
 %if %2 == 4
    punpcklbw       m0, m7
 %else
    punpckhbw       m1, m0, m7
    punpcklbw       m0, m7
    movhlps         m3, m1
    movd   [px+32*4-4], m1
    movd   [px+32*6-4], m3
    psrlq           m1, 32
    psrlq           m3, 32
    movd   [px+32*5-4], m1
    movd   [px+32*7-4], m3
 %endif
    movhlps         m2, m0
    movd   [px+32*0-4], m0
    movd   [px+32*2-4], m2
    psrlq           m0, 32
    psrlq           m2, 32
    movd   [px+32*1-4], m0
    movd   [px+32*3-4], m2
    jmp .left_done
.no_left:
    movd   [px+32*0-4], m6
    movd   [px+32*1-4], m6
    movd   [px+32*2-4], m6
    movd   [px+32*3-4], m6
 %if %2 == 8
    movd   [px+32*4-4], m6
    movd   [px+32*5-4], m6
    movd   [px+32*6-4], m6
    movd   [px+32*7-4], m6
 %endif
.left_done:

    ; bottom
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3
 %else
    DEFINE_ARGS dst, stride, dst8, edge, stride3
 %endif
    test         edgeb, 8 ; have_bottom
    jz .no_bottom
    lea          dst8q, [dstq+%2*strideq]
    test         edgeb, 1 ; have_left
    jz .bottom_no_left
    test         edgeb, 2 ; have_right
    jz .bottom_no_right
 %if %1 == 4
    PMOVZXBW        m0, [dst8q-(%1/2)]
    PMOVZXBW        m1, [dst8q+strideq-(%1/2)]
 %else
    movu            m0, [dst8q-4]
    movu            m1, [dst8q+strideq-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu [px+32*(%2+0)+8], m2
    movu [px+32*(%2+1)+8], m3
 %endif
    movu [px+32*(%2+0)-%1], m0
    movu [px+32*(%2+1)-%1], m1
    jmp .bottom_done
.bottom_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [dst8q-4]
    PMOVZXBW        m1, [dst8q+strideq-4]
    movu [px+32*(%2+0)-8], m0
    movu [px+32*(%2+1)-8], m1
 %else
    movu            m0, [dst8q-8]
    movu            m1, [dst8q+strideq-8]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)-16], m0
    mova [px+32*(%2+0)+ 0], m2
    mova [px+32*(%2+1)-16], m1
    mova [px+32*(%2+1)+ 0], m3
    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
 %endif
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    jmp .bottom_done
.bottom_no_left:
    test         edgeb, 2 ; have_right
    jz .bottom_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [dst8q]
    PMOVZXBW        m1, [dst8q+strideq]
 %else
    movu            m0, [dst8q]
    movu            m1, [dst8q+strideq]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)+16], m2
    mova [px+32*(%2+1)+16], m3
 %endif
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.bottom_no_left_right:
    PMOVZXBW        m0, [dst8q+strideq*0], %1 == 4
    PMOVZXBW        m1, [dst8q+strideq*1], %1 == 4
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.no_bottom:
    movu [px+32*(%2+0)- 4], m6
    movu [px+32*(%2+1)- 4], m6
 %if %1 == 8
    movq [px+32*(%2+0)+12], m6
    movq [px+32*(%2+1)+12], m6
 %endif
.bottom_done:

    ; actual filter
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
    mova           m13, [shufb_lohi]
 %if cpuflag(ssse3)
    mova           m15, [pw_2048]
 %else
    mova           m15, [pw_8]
 %endif
    mova           m14, m6
 %else
    ; x86-32 has only 8 xmm regs; alias the high registers to low ones or
    ; to memory operands (their owners are spilled to the stack below)
    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
    %xdefine        m8  m1
    %xdefine        m9  m2
    %xdefine       m10  m0
    %xdefine       m13  [base+shufb_lohi]
    %xdefine       m14  OUT_OF_BOUNDS_MEM
 %if cpuflag(ssse3)
    %xdefine       m15  [base+pw_2048]
 %else
    %xdefine       m15  [base+pw_8]
 %endif
 %endif
    movifnidn     prid, r4m
    movifnidn     secd, r5m
    mov       dampingd, r7m
    movif32 [esp+0x3C], r1d
    test          prid, prid
    jz .sec_only
    movd            m1, prim
    bsr        pridmpd, prid
    test          secd, secd
    jz .pri_only
    movd           m10, r5m
    bsr           secd, secd
    and           prid, 1
    sub        pridmpd, dampingd        ; pri_shift = damping - msb(pri), ...
    sub           secd, dampingd        ; sec_shift = damping - msb(sec), ...
    xor       dampingd, dampingd
    add           prid, prid
    neg        pridmpd
    cmovs      pridmpd, dampingd        ; ... clamped to >= 0
    neg           secd
    cmovs         secd, dampingd
    PSHUFB_0        m1, m7
    PSHUFB_0       m10, m7
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
    lea           tapq, [tap_table]
    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
    mov     [rsp+0x00], pridmpq          ; pri_shift
    mov     [rsp+0x10], secq             ; sec_shift
    DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
 %else
    MOVDDUP         m2, [tapq+pridmpq*8]
    MOVDDUP         m3, [tapq+secq*8]
    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
    mov     [esp+0x00], pridmpd
    mov     [esp+0x30], secd
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
  %define         offq  dstq
  %define           kd  strided
  %define           kq  strideq
    mova    [esp+0x10], m2
    mova    [esp+0x40], m3
    mova    [esp+0x20], m1
    mova    [esp+0x50], m10
 %endif
    mov           dird, r6m
    lea           stkq, [px]
    lea           priq, [tapq+8*8+priq*8] ; pri_taps
    mov             hd, %1*%2/8
    lea           dirq, [tapq+dirq*2]
.v_loop:
    movif32 [esp+0x38], dstd
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]       ; px
 %endif
    pxor            m0, m0                ; sum
    mova            m7, m4                ; max
    mova            m8, m4                ; min
.k_loop:
    MOVDDUP         m2, [priq+kq*8]
 %if ARCH_X86_64
    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
 %else
    ; 32-bit: m2 aliases m9 and is clobbered inside ACCUMULATE_TAP,
    ; so the sec taps have to be reloaded before each use
    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
 %endif
    dec             kd
    jge .k_loop
    movif32       dstq, [esp+0x38]
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 1
    dec             hd
    jg .v_loop
    RET

.pri_only:
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
    lea           tapq, [tap_table]
 %else
    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
 %endif
    and           prid, 1
    xor          zerod, zerod
    sub       dampingd, pridmpd
    cmovs     dampingd, zerod
    add           prid, prid
    PSHUFB_0        m1, m7
    MOVDDUP         m7, [tapq+dampingq*8]
    mov     [rsp+0x00], dampingq
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
 %else
    mov     [rsp+0x04], zerod
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %endif
    mov           dird, r6m
    lea           stkq, [px]
    lea           priq, [tapq+8*8+priq*8]
    mov             hd, %1*%2/8
    lea           dirq, [tapq+dirq*2]
.pri_v_loop:
    movif32 [esp+0x38], dstd
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]
 %endif
    pxor            m0, m0
.pri_k_loop:
    MOVDDUP         m2, [priq+kq*8]
    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
    dec             kd
    jge .pri_k_loop
    movif32       dstq, [esp+0x38]
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec             hd
    jg .pri_v_loop
    RET

.sec_only:
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
%else
    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
%endif
    movd            m1, r5m
    bsr           secd, secd
    mov           dird, r6m
    xor          zerod, zerod
    sub       dampingd, secd
    cmovs     dampingd, zerod
    PSHUFB_0        m1, m7
 %if ARCH_X86_64
    lea           tapq, [tap_table]
 %else
    mov     [rsp+0x04], zerod
 %endif
    mov     [rsp+0x00], dampingq
    MOVDDUP         m7, [tapq+dampingq*8]
    lea           dirq, [tapq+dirq*2]
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
 %else
    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
 %endif
    lea           stkq, [px]
    mov             hd, %1*%2/8
.sec_v_loop:
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]
 %endif
    pxor            m0, m0
.sec_k_loop:
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
 %if ARCH_X86_32
    MOVDDUP         m2, [tapq+12*8+kq*8]  ; m2 (m9) clobbered above; reload
 %endif
    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
    dec             kd
    jge .sec_k_loop
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec             hd
    jg .sec_v_loop
    RET
%endmacro
745
; Packed 32-bit multiply: %1 *= %2 (per dword).
; Uses pmulld on SSE4.1; otherwise emulated with 16-bit multiplies, which
; is only exact when each dword of %2 fits in 16 bits (true for all
; div_table entries, which are <= 840). Clobbers m15 (m1 on x86-32).
%macro MULLD 2
 %if cpuflag(sse4)
    pmulld          %1, %2
 %else
  %if ARCH_X86_32
   %define m15 m1
  %endif
    pmulhuw        m15, %1, %2   ; high 16 bits of the low-word products
    pmullw          %1, %2       ; low 16 bits of per-word products
    pslld          m15, 16
    paddd           %1, m15      ; recombine into 32-bit results
 %endif
%endmacro
759
760%macro CDEF_DIR 0
761 %if ARCH_X86_64
762cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
763    lea             r6, [strideq*3]
764    movq            m1, [srcq+strideq*0]
765    movhps          m1, [srcq+strideq*1]
766    movq            m3, [srcq+strideq*2]
767    movhps          m3, [srcq+r6       ]
768    lea           srcq, [srcq+strideq*4]
769    movq            m5, [srcq+strideq*0]
770    movhps          m5, [srcq+strideq*1]
771    movq            m7, [srcq+strideq*2]
772    movhps          m7, [srcq+r6       ]
773
774    pxor            m8, m8
775    psadbw          m9, m1, m8
776    psadbw          m2, m3, m8
777    psadbw          m4, m5, m8
778    psadbw          m6, m7, m8
779    packssdw        m9, m2
780    packssdw        m4, m6
781    packssdw        m9, m4
782
783    punpcklbw       m0, m1, m8
784    punpckhbw       m1, m8
785    punpcklbw       m2, m3, m8
786    punpckhbw       m3, m8
787    punpcklbw       m4, m5, m8
788    punpckhbw       m5, m8
789    punpcklbw       m6, m7, m8
790    punpckhbw       m7, m8
791cglobal_label .main
792    mova            m8, [pw_128]
793    psubw           m0, m8
794    psubw           m1, m8
795    psubw           m2, m8
796    psubw           m3, m8
797    psubw           m4, m8
798    psubw           m5, m8
799    psubw           m6, m8
800    psubw           m7, m8
801    psllw           m8, 3
802    psubw           m9, m8                  ; partial_sum_hv[0]
803
804    paddw           m8, m0, m1
805    paddw          m10, m2, m3
806    paddw           m8, m4
807    paddw          m10, m5
808    paddw           m8, m6
809    paddw          m10, m7
810    paddw           m8, m10                 ; partial_sum_hv[1]
811
812    pmaddwd         m8, m8
813    pmaddwd         m9, m9
814    phaddd          m9, m8
815    SWAP            m8, m9
816    MULLD           m8, [div_table%+SUFFIX+48]
817
818    pslldq          m9, m1, 2
819    psrldq         m10, m1, 14
820    pslldq         m11, m2, 4
821    psrldq         m12, m2, 12
822    pslldq         m13, m3, 6
823    psrldq         m14, m3, 10
824    paddw           m9, m0
825    paddw          m10, m12
826    paddw          m11, m13
827    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
828    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
829    pslldq         m11, m4, 8
830    psrldq         m12, m4, 8
831    pslldq         m13, m5, 10
832    psrldq         m14, m5, 6
833    paddw           m9, m11
834    paddw          m10, m12
835    paddw           m9, m13
836    paddw          m10, m14
837    pslldq         m11, m6, 12
838    psrldq         m12, m6, 4
839    pslldq         m13, m7, 14
840    psrldq         m14, m7, 2
841    paddw           m9, m11
842    paddw          m10, m12
843    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
844    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
845    pshufb         m10, [shufw_6543210x]
846    punpckhwd      m11, m9, m10
847    punpcklwd       m9, m10
848    pmaddwd        m11, m11
849    pmaddwd         m9, m9
850    MULLD          m11, [div_table%+SUFFIX+16]
851    MULLD           m9, [div_table%+SUFFIX+0]
852    paddd           m9, m11                 ; cost[0a-d]
853
854    pslldq         m10, m0, 14
855    psrldq         m11, m0, 2
856    pslldq         m12, m1, 12
857    psrldq         m13, m1, 4
858    pslldq         m14, m2, 10
859    psrldq         m15, m2, 6
860    paddw          m10, m12
861    paddw          m11, m13
862    paddw          m10, m14
863    paddw          m11, m15
864    pslldq         m12, m3, 8
865    psrldq         m13, m3, 8
866    pslldq         m14, m4, 6
867    psrldq         m15, m4, 10
868    paddw          m10, m12
869    paddw          m11, m13
870    paddw          m10, m14
871    paddw          m11, m15
872    pslldq         m12, m5, 4
873    psrldq         m13, m5, 12
874    pslldq         m14, m6, 2
875    psrldq         m15, m6, 14
876    paddw          m10, m12
877    paddw          m11, m13
878    paddw          m10, m14
879    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
880    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
881    pshufb         m11, [shufw_6543210x]
882    punpckhwd      m12, m10, m11
883    punpcklwd      m10, m11
884    pmaddwd        m12, m12
885    pmaddwd        m10, m10
886    MULLD          m12, [div_table%+SUFFIX+16]
887    MULLD          m10, [div_table%+SUFFIX+0]
888    paddd          m10, m12                 ; cost[4a-d]
889    phaddd          m9, m10                 ; cost[0a/b,4a/b]
890
891    paddw          m10, m0, m1
892    paddw          m11, m2, m3
893    paddw          m12, m4, m5
894    paddw          m13, m6, m7
895    phaddw          m0, m4
896    phaddw          m1, m5
897    phaddw          m2, m6
898    phaddw          m3, m7
899
900    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
901    pslldq          m4, m11, 2
902    psrldq          m5, m11, 14
903    pslldq          m6, m12, 4
904    psrldq          m7, m12, 12
905    pslldq         m14, m13, 6
906    psrldq         m15, m13, 10
907    paddw           m4, m10
908    paddw           m5, m7
909    paddw           m4, m6
910    paddw           m5, m15                 ; partial_sum_alt[3] right
911    paddw           m4, m14                 ; partial_sum_alt[3] left
912    pshuflw         m6, m5, q3012
913    punpckhwd       m5, m4
914    punpcklwd       m4, m6
915    pmaddwd         m5, m5
916    pmaddwd         m4, m4
917    MULLD           m5, [div_table%+SUFFIX+48]
918    MULLD           m4, [div_table%+SUFFIX+32]
919    paddd           m4, m5                  ; cost[7a-d]
920
921    pslldq          m5, m10, 6
922    psrldq          m6, m10, 10
923    pslldq          m7, m11, 4
924    psrldq         m10, m11, 12
925    pslldq         m11, m12, 2
926    psrldq         m12, 14
927    paddw           m5, m7
928    paddw           m6, m10
929    paddw           m5, m11
930    paddw           m6, m12
931    paddw           m5, m13
932    pshuflw         m7, m6, q3012
933    punpckhwd       m6, m5
934    punpcklwd       m5, m7
935    pmaddwd         m6, m6
936    pmaddwd         m5, m5
937    MULLD           m6, [div_table%+SUFFIX+48]
938    MULLD           m5, [div_table%+SUFFIX+32]
939    paddd           m5, m6                  ; cost[5a-d]
940
941    pslldq          m6, m1, 2
942    psrldq          m7, m1, 14
943    pslldq         m10, m2, 4
944    psrldq         m11, m2, 12
945    pslldq         m12, m3, 6
946    psrldq         m13, m3, 10
947    paddw           m6, m0
948    paddw           m7, m11
949    paddw           m6, m10
950    paddw           m7, m13                 ; partial_sum_alt[3] right
951    paddw           m6, m12                 ; partial_sum_alt[3] left
952    pshuflw        m10, m7, q3012
953    punpckhwd       m7, m6
954    punpcklwd       m6, m10
955    pmaddwd         m7, m7
956    pmaddwd         m6, m6
957    MULLD           m7, [div_table%+SUFFIX+48]
958    MULLD           m6, [div_table%+SUFFIX+32]
959    paddd           m6, m7                  ; cost[1a-d]
960
961    pshufd          m0, m0, q1032
962    pshufd          m1, m1, q1032
963    pshufd          m2, m2, q1032
964    pshufd          m3, m3, q1032
965
966    pslldq         m10, m0, 6
967    psrldq         m11, m0, 10
968    pslldq         m12, m1, 4
969    psrldq         m13, m1, 12
970    pslldq         m14, m2, 2
971    psrldq          m2, 14
972    paddw          m10, m12
973    paddw          m11, m13
974    paddw          m10, m14
975    paddw          m11, m2
976    paddw          m10, m3
977    pshuflw        m12, m11, q3012
978    punpckhwd      m11, m10
979    punpcklwd      m10, m12
980    pmaddwd        m11, m11
981    pmaddwd        m10, m10
982    MULLD          m11, [div_table%+SUFFIX+48]
983    MULLD          m10, [div_table%+SUFFIX+32]
984    paddd          m10, m11                 ; cost[3a-d]
985
986    phaddd          m9, m8                  ; cost[0,4,2,6]
987    phaddd          m6, m10
988    phaddd          m5, m4
989    phaddd          m6, m5                  ; cost[1,3,5,7]
990    pshufd          m4, m9, q3120
991
992    ; now find the best cost
993  %if cpuflag(sse4)
994    pmaxsd          m9, m6
995    pshufd          m0, m9, q1032
996    pmaxsd          m0, m9
997    pshufd          m1, m0, q2301
998    pmaxsd          m0, m1                  ; best cost
999  %else
1000    pcmpgtd         m0, m9, m6
1001    pand            m9, m0
1002    pandn           m0, m6
1003    por             m9, m0
1004    pshufd          m1, m9, q1032
1005    pcmpgtd         m0, m9, m1
1006    pand            m9, m0
1007    pandn           m0, m1
1008    por             m9, m0
1009    pshufd          m1, m9, q2301
1010    pcmpgtd         m0, m9, m1
1011    pand            m9, m0
1012    pandn           m0, m1
1013    por             m0, m9
1014  %endif
1015
1016    ; get direction and variance
1017    punpckhdq       m1, m4, m6
1018    punpckldq       m4, m6
1019    psubd           m2, m0, m1
1020    psubd           m3, m0, m4
1021%if WIN64
1022    WIN64_RESTORE_XMM
1023    %define tmp rsp+stack_offset+8
1024%else
1025    %define tmp rsp-40
1026%endif
1027    mova    [tmp+0x00], m2                  ; emulate ymm in stack
1028    mova    [tmp+0x10], m3
1029    pcmpeqd         m1, m0                  ; compute best cost mask
1030    pcmpeqd         m4, m0
1031    packssdw        m4, m1
1032    pmovmskb       eax, m4                  ; get byte-idx from mask
1033    tzcnt          eax, eax
1034    mov            r1d, [tmp+rax*2]         ; get idx^4 complement from emulated ymm
1035    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
1036    shr            r1d, 10
1037    mov         [varq], r1d
1038 %else
1039cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
1040%define base r2-shufw_6543210x
1041    LEA             r2, shufw_6543210x
1042    pxor            m0, m0
1043    lea       stride3q, [strideq*3]
1044    movq            m5, [srcq+strideq*0]
1045    movhps          m5, [srcq+strideq*1]
1046    movq            m7, [srcq+strideq*2]
1047    movhps          m7, [srcq+stride3q]
1048    mova            m1, [base+pw_128]
1049    psadbw          m2, m5, m0
1050    psadbw          m3, m7, m0
1051    packssdw        m2, m3
1052    punpcklbw       m4, m5, m0
1053    punpckhbw       m5, m0
1054    punpcklbw       m6, m7, m0
1055    punpckhbw       m7, m0
1056    psubw           m4, m1
1057    psubw           m5, m1
1058    psubw           m6, m1
1059    psubw           m7, m1
1060
1061    mova    [esp+0x00], m4
1062    mova    [esp+0x10], m5
1063    mova    [esp+0x20], m6
1064    mova    [esp+0x50], m7
1065
1066    lea           srcq, [srcq+strideq*4]
1067    movq            m5, [srcq+strideq*0]
1068    movhps          m5, [srcq+strideq*1]
1069    movq            m7, [srcq+strideq*2]
1070    movhps          m7, [srcq+stride3q]
1071    psadbw          m3, m5, m0
1072    psadbw          m0, m7
1073    packssdw        m3, m0
1074    pxor            m0, m0
1075    punpcklbw       m4, m5, m0
1076    punpckhbw       m5, m0
1077    punpcklbw       m6, m7, m0
1078    punpckhbw       m7, m0
1079cglobal_label .main
1080    psubw           m4, m1
1081    psubw           m5, m1
1082    psubw           m6, m1
1083    psubw           m7, m1
1084    packssdw        m2, m3
1085    psllw           m1, 3
1086    psubw           m2, m1                  ; partial_sum_hv[0]
1087    pmaddwd         m2, m2
1088
1089    mova            m3, [esp+0x50]
1090    mova            m0, [esp+0x00]
1091    paddw           m0, [esp+0x10]
1092    paddw           m1, m3, [esp+0x20]
1093    paddw           m0, m4
1094    paddw           m1, m5
1095    paddw           m0, m6
1096    paddw           m1, m7
1097    paddw           m0, m1                  ; partial_sum_hv[1]
1098    pmaddwd         m0, m0
1099
1100    phaddd          m2, m0
1101    MULLD           m2, [base+div_table%+SUFFIX+48]
1102    mova    [esp+0x30], m2
1103
1104    mova            m1, [esp+0x10]
1105    pslldq          m0, m1, 2
1106    psrldq          m1, 14
1107    paddw           m0, [esp+0x00]
1108    pslldq          m2, m3, 6
1109    psrldq          m3, 10
1110    paddw           m0, m2
1111    paddw           m1, m3
1112    mova            m3, [esp+0x20]
1113    pslldq          m2, m3, 4
1114    psrldq          m3, 12
1115    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
1116    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
1117    pslldq          m2, m4, 8
1118    psrldq          m3, m4, 8
1119    paddw           m0, m2
1120    paddw           m1, m3
1121    pslldq          m2, m5, 10
1122    psrldq          m3, m5, 6
1123    paddw           m0, m2
1124    paddw           m1, m3
1125    pslldq          m2, m6, 12
1126    psrldq          m3, m6, 4
1127    paddw           m0, m2
1128    paddw           m1, m3
1129    pslldq          m2, m7, 14
1130    psrldq          m3, m7, 2
1131    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
1132    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
1133    mova            m3, [esp+0x50]
1134    pshufb          m1, [base+shufw_6543210x]
1135    punpckhwd       m2, m0, m1
1136    punpcklwd       m0, m1
1137    pmaddwd         m2, m2
1138    pmaddwd         m0, m0
1139    MULLD           m2, [base+div_table%+SUFFIX+16]
1140    MULLD           m0, [base+div_table%+SUFFIX+ 0]
1141    paddd           m0, m2                  ; cost[0a-d]
1142    mova    [esp+0x40], m0
1143
1144    mova            m1, [esp+0x00]
1145    pslldq          m0, m1, 14
1146    psrldq          m1, 2
1147    paddw           m0, m7
1148    pslldq          m2, m3, 8
1149    psrldq          m3, 8
1150    paddw           m0, m2
1151    paddw           m1, m3
1152    mova            m3, [esp+0x20]
1153    pslldq          m2, m3, 10
1154    psrldq          m3, 6
1155    paddw           m0, m2
1156    paddw           m1, m3
1157    mova            m3, [esp+0x10]
1158    pslldq          m2, m3, 12
1159    psrldq          m3, 4
1160    paddw           m0, m2
1161    paddw           m1, m3
1162    pslldq          m2, m4, 6
1163    psrldq          m3, m4, 10
1164    paddw           m0, m2
1165    paddw           m1, m3
1166    pslldq          m2, m5, 4
1167    psrldq          m3, m5, 12
1168    paddw           m0, m2
1169    paddw           m1, m3
1170    pslldq          m2, m6, 2
1171    psrldq          m3, m6, 14
1172    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
1173    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
1174    mova            m3, [esp+0x50]
1175    pshufb          m1, [base+shufw_6543210x]
1176    punpckhwd       m2, m0, m1
1177    punpcklwd       m0, m1
1178    pmaddwd         m2, m2
1179    pmaddwd         m0, m0
1180    MULLD           m2, [base+div_table%+SUFFIX+16]
1181    MULLD           m0, [base+div_table%+SUFFIX+ 0]
1182    paddd           m0, m2                  ; cost[4a-d]
1183    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
1184    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
1185    mova    [esp+0x30], m1
1186
1187    phaddw          m0, [esp+0x00], m4
1188    phaddw          m1, [esp+0x10], m5
1189    paddw           m4, m5
1190    mova            m2, [esp+0x20]
1191    paddw           m5, m2, m3
1192    phaddw          m2, m6
1193    paddw           m6, m7
1194    phaddw          m3, m7
1195    mova            m7, [esp+0x00]
1196    paddw           m7, [esp+0x10]
1197    mova    [esp+0x00], m0
1198    mova    [esp+0x10], m1
1199    mova    [esp+0x20], m2
1200
1201    pslldq          m1, m4, 4
1202    pslldq          m2, m6, 6
1203    pslldq          m0, m5, 2
1204    paddw           m1, m2
1205    paddw           m0, m7
1206    psrldq          m2, m5, 14
1207    paddw           m0, m1                  ; partial_sum_alt[3] left
1208    psrldq          m1, m4, 12
1209    paddw           m1, m2
1210    psrldq          m2, m6, 10
1211    paddw           m1, m2                  ; partial_sum_alt[3] right
1212    pshuflw         m1, m1, q3012
1213    punpckhwd       m2, m0, m1
1214    punpcklwd       m0, m1
1215    pmaddwd         m2, m2
1216    pmaddwd         m0, m0
1217    MULLD           m2, [base+div_table%+SUFFIX+48]
1218    MULLD           m0, [base+div_table%+SUFFIX+32]
1219    paddd           m0, m2                  ; cost[7a-d]
1220    mova    [esp+0x40], m0
1221
1222    pslldq          m0, m7, 6
1223    psrldq          m7, 10
1224    pslldq          m1, m5, 4
1225    psrldq          m5, 12
1226    pslldq          m2, m4, 2
1227    psrldq          m4, 14
1228    paddw           m0, m6
1229    paddw           m7, m5
1230    paddw           m0, m1
1231    paddw           m7, m4
1232    paddw           m0, m2
1233    pshuflw         m2, m7, q3012
1234    punpckhwd       m7, m0
1235    punpcklwd       m0, m2
1236    pmaddwd         m7, m7
1237    pmaddwd         m0, m0
1238    MULLD           m7, [base+div_table%+SUFFIX+48]
1239    MULLD           m0, [base+div_table%+SUFFIX+32]
1240    paddd           m0, m7                  ; cost[5a-d]
1241    mova    [esp+0x50], m0
1242
1243    mova            m7, [esp+0x10]
1244    mova            m2, [esp+0x20]
1245    pslldq          m0, m7, 2
1246    psrldq          m7, 14
1247    pslldq          m4, m2, 4
1248    psrldq          m2, 12
1249    pslldq          m5, m3, 6
1250    psrldq          m6, m3, 10
1251    paddw           m0, [esp+0x00]
1252    paddw           m7, m2
1253    paddw           m4, m5
1254    paddw           m7, m6                  ; partial_sum_alt[3] right
1255    paddw           m0, m4                  ; partial_sum_alt[3] left
1256    pshuflw         m2, m7, q3012
1257    punpckhwd       m7, m0
1258    punpcklwd       m0, m2
1259    pmaddwd         m7, m7
1260    pmaddwd         m0, m0
1261    MULLD           m7, [base+div_table%+SUFFIX+48]
1262    MULLD           m0, [base+div_table%+SUFFIX+32]
1263    paddd           m0, m7                  ; cost[1a-d]
1264    SWAP            m0, m4
1265
1266    pshufd          m0, [esp+0x00], q1032
1267    pshufd          m1, [esp+0x10], q1032
1268    pshufd          m2, [esp+0x20], q1032
1269    pshufd          m3, m3, q1032
1270    mova    [esp+0x00], m4
1271
1272    pslldq          m4, m0, 6
1273    psrldq          m0, 10
1274    pslldq          m5, m1, 4
1275    psrldq          m1, 12
1276    pslldq          m6, m2, 2
1277    psrldq          m2, 14
1278    paddw           m4, m3
1279    paddw           m0, m1
1280    paddw           m5, m6
1281    paddw           m0, m2
1282    paddw           m4, m5
1283    pshuflw         m2, m0, q3012
1284    punpckhwd       m0, m4
1285    punpcklwd       m4, m2
1286    pmaddwd         m0, m0
1287    pmaddwd         m4, m4
1288    MULLD           m0, [base+div_table%+SUFFIX+48]
1289    MULLD           m4, [base+div_table%+SUFFIX+32]
1290    paddd           m4, m0                   ; cost[3a-d]
1291
1292    mova            m1, [esp+0x00]
1293    mova            m2, [esp+0x50]
1294    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
1295    phaddd          m1, m4
1296    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
1297    phaddd          m1, m2
1298    pshufd          m2, m0, q3120
1299
1300    ; now find the best cost
1301  %if cpuflag(sse4)
1302    pmaxsd          m0, m1
1303    pshufd          m3, m0, q1032
1304    pmaxsd          m3, m0
1305    pshufd          m0, m3, q2301
1306    pmaxsd          m0, m3
1307  %else
1308    pcmpgtd         m3, m0, m1
1309    pand            m0, m3
1310    pandn           m3, m1
1311    por             m0, m3
1312    pshufd          m4, m0, q1032
1313    pcmpgtd         m3, m0, m4
1314    pand            m0, m3
1315    pandn           m3, m4
1316    por             m0, m3
1317    pshufd          m4, m0, q2301
1318    pcmpgtd         m3, m0, m4
1319    pand            m0, m3
1320    pandn           m3, m4
1321    por             m0, m3
1322  %endif
1323
1324    ; get direction and variance
1325    mov           vard, varm
1326    punpckhdq       m3, m2, m1
1327    punpckldq       m2, m1
1328    psubd           m1, m0, m3
1329    psubd           m4, m0, m2
1330    mova    [esp+0x00], m1                  ; emulate ymm in stack
1331    mova    [esp+0x10], m4
1332    pcmpeqd         m3, m0                  ; compute best cost mask
1333    pcmpeqd         m2, m0
1334    packssdw        m2, m3
1335    pmovmskb       eax, m2                  ; get byte-idx from mask
1336    tzcnt          eax, eax
1337    mov            r1d, [esp+eax*2]         ; get idx^4 complement from emulated ymm
1338    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
1339    shr            r1d, 10
1340    mov         [vard], r1d
1341 %endif
1342
1343    RET
1344%endmacro
1345
; Instantiate the CDEF kernels for each supported SIMD level.
; INIT_XMM selects the target instruction set and register naming for the
; macro expansions that follow; CDEF_FILTER is expanded once per block size
; (8x8, 4x8, 4x4), and CDEF_DIR emits the direction-search function.
INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

; No sse2 CDEF_DIR: the direction search uses pshufb (see the
; shufw_6543210x shuffles above), which requires SSSE3 or later.
INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4