; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

pb_right_ext_mask: times 16 db 0xff
                   times 16 db 0
pb_14x0_1_2: times 14 db 0
             db 1, 2
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
pb_0: times 16 db 0
pb_2: times 16 db 2
pb_3: times 16 db 3
pb_4: times 16 db 4
pb_15: times 16 db 15
pb_0_1: times 8 db 0, 1
pb_6_7: times 8 db 6, 7
pb_14_15: times 8 db 14, 15
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_128: times 8 dw 128
pw_255: times 8 dw 255
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
pw_16380: times 8 dw 16380
pw_5_6: times 4 dw 5, 6
pw_0_128: times 4 dw 0, 128
pd_1024: times 4 dd 1024
%if ARCH_X86_32
pd_256: times 4 dd 256
pd_512: times 4 dd 512
pd_2048: times 4 dd 2048
%endif
pd_0xF0080029: times 4 dd 0xF0080029
pd_0xF00801C7: times 4 dd 0xF00801C7

cextern sgr_x_by_x
SECTION .text

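; On x86-32 there are not enough registers to keep a pointer to the constant
; tables live, so SETUP_PIC materializes the section base ($$) in PIC_reg and
; PIC_sym() addresses every constant relative to it. XCHG_PIC_REG swaps
; PIC_reg with a second value spilled in two alternating stack slots, for
; code paths that temporarily need the register for something else. On
; x86-64, PIC_sym() is a plain symbol reference.
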
%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov        [esp], %1
  %endif
    LEA      PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym)   (sym)
%endif

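; PALIGNR: on SSSE3 this is the native palignr; on plain SSE2 it is emulated
; with a byte-shift pair (src2 right by the shift amount, src1 left by the
; remainder) merged with por, using the register numbered one above dst as
; scratch, which callers must keep free.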
%macro PALIGNR 4 ; dst, src1, src2, shift
 %if cpuflag(ssse3)
    palignr       %1, %2, %3, %4
 %else
  %assign %%i regnumof%+%1 + 1
  %define %%tmp m %+ %%i
    psrldq        %1, %3, %4
    pslldq     %%tmp, %2, 16-%4
    por           %1, %%tmp
 %endif
%endmacro

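; PMADDUBSW: on SSSE3 this is the native pmaddubsw; on SSE2 the byte pairs
; are widened to words against a zero register (re-cleared when reset_zero
; is 1) and multiplied with pmaddwd, then the dword sums are packed back to
; words. Callers provide the coefficients as byte pairs for SSSE3 and as
; sign-extended words for SSE2.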
%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
 %if cpuflag(ssse3)
    pmaddubsw     %1, %2
 %else
  %if %5 == 1
    pxor          %3, %3
  %endif
    punpckhbw     %4, %1, %3
    punpcklbw     %1, %3
    pmaddwd       %4, %2
    pmaddwd       %1, %2
    packssdw      %1, %4
 %endif
%endmacro

;;;;;;;;;;;;;;;;;;;;;;
;;      wiener      ;;
;;;;;;;;;;;;;;;;;;;;;;

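; wiener_filter_h applies the horizontal 7-tap pass and stores biased 16-bit
; intermediates into a temporary buffer with a fixed row stride of 384*2
; bytes; wiener_filter_v then runs the vertical 7-tap pass over that buffer
; and writes the final 8-bit pixels. The code relies on the AV1 Wiener taps
; being symmetric (fh[0] == fh[6], fh[1] == fh[5], fh[2] == fh[4]), pairing
; each coefficient with two mirrored source pixels.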
%macro WIENER_H 0
%if ARCH_X86_64
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
    mov        edged, edgem
    movifnidn     wd, wm
    mov           hd, hm
%else
cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
    mov           r5, edgem
    mov     [esp+12], r5
    mov           wd, wm
    mov           hd, hm
    SETUP_PIC hd
 %define m15 m0
 %define m14 m1
 %define m13 m2
 %define m12 m3
%endif

    movq         m15, [fhq]
%if cpuflag(ssse3)
    pshufb       m12, m15, [PIC_sym(pb_6_7)]
    pshufb       m13, m15, [PIC_sym(pb_4)]
    pshufb       m14, m15, [PIC_sym(pb_2)]
    pshufb       m15, m15, [PIC_sym(pb_0)]
%else
    pshuflw      m12, m15, q3333
    punpcklbw    m15, m15
    pshufhw      m13, m15, q0000
    pshuflw      m14, m15, q2222
    pshuflw      m15, m15, q0000
    punpcklqdq   m12, m12
    punpckhqdq   m13, m13
    punpcklqdq   m14, m14
    punpcklqdq   m15, m15
    psraw        m13, 8
    psraw        m14, 8
    psraw        m15, 8
%endif

%if ARCH_X86_64
    mova         m11, [pw_2048]
    mova         m10, [pw_16380]
    lea          r11, [pb_right_ext_mask]

    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
 %define m10    [PIC_sym(pw_16380)]
 %define m11    [PIC_sym(pw_2048)]
 %define m12    [esp+0x14]
 %define m13    [esp+0x24]
 %define m14    [esp+0x34]
 %define m15    [esp+0x44]
    mova         m12, m3
    mova         m13, m2
    mova         m14, m1
    mova         m15, m0

    DEFINE_ARGS dst, left, src, stride, x, w, h, edge
 %define srcptrq    srcq
 %define dstptrq    dstq
 %define hd         dword [esp+ 0]
 %define edgeb      byte  [esp+12]
 %define xlimd      dword [esp+16]
%endif

    ; if (edge & has_right) align_w_to_16
    ; else w -= 3, and use that as limit in x loop
    test       edgeb, 2 ; has_right
    jnz .align
    mov        xlimd, -3
    jmp .loop
.align:
    add           wd, 15
    and           wd, ~15
%if ARCH_X86_64
    xor        xlimd, xlimd
%else
    mov        xlimd, 0
%endif

    ; main y loop for horizontal filter
.loop:
%if ARCH_X86_64
    mov      srcptrq, srcq
    mov      dstptrq, dstq
    lea           xd, [wq+xlimq]
%else
    mov      [esp+8], srcq
    mov      [esp+4], dstq
    mov           xd, xlimd
    add           xd, wd
%endif

    ; load left edge pixels
    test       edgeb, 1 ; have_left
    jz .emu_left
    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
    jz .load_left_combined
    movd          m0, [leftq]
    movd          m1, [srcq]
    punpckldq     m0, m1
    pslldq        m0, 9
    add        leftq, 4
    jmp .left_load_done
.load_left_combined:
    movq          m0, [srcq-3]
    pslldq        m0, 10
    jmp .left_load_done
.emu_left:
    movd          m0, [srcq]
%if cpuflag(ssse3)
    pshufb        m0, [PIC_sym(pb_14x0_1_2)]
%else
    pslldq        m1, m0, 13
    punpcklbw     m0, m0
    pshuflw       m0, m0, q0000
    punpcklqdq    m0, m0
    psrldq        m0, 2
    por           m0, m1
%endif

    ; load right edge pixels
.left_load_done:
    cmp           xd, 16
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    je .splat_right

    ; for very small images (w=[1-2]), edge-extend the original cache;
    ; ugly, but it only runs in very odd cases
%if cpuflag(ssse3)
    add           wd, wd
 %if ARCH_X86_64
    pshufb        m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
 %else
    pshufb        m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
 %endif
    shr           wd, 1
%else
    shl           wd, 4
    pcmpeqd       m2, m2
    movd          m3, wd
    psrldq        m2, 2
    punpckhbw     m1, m0, m0
    pshufhw       m1, m1, q1122
    psllq         m1, m3
    pand          m0, m2
    pandn         m2, m1
    por           m0, m2
    shr           wd, 4
%endif

    ; main x loop; execution mostly starts at .main_load
.splat_right:
    ; no need to load new pixels, just extend them from the (possibly
    ; previously extended) previous load into m0
%if cpuflag(ssse3)
    pshufb        m1, m0, [PIC_sym(pb_15)]
%else
    punpckhbw     m1, m0, m0
    pshufhw       m1, m1, q3333
    punpckhqdq    m1, m1
%endif
    jmp .main_loop
.load_and_splat:
    ; load new pixels and extend edge for right-most
    movu          m1, [srcptrq+3]
%if ARCH_X86_64
    sub          r11, xq
    movu          m2, [r11+16]
    add          r11, xq
%else
    sub      PIC_reg, xd
    movu          m2, [PIC_sym(pb_right_ext_mask)+16]
    add      PIC_reg, xd
%endif
    movd          m3, [srcptrq+2+xq]
%if cpuflag(ssse3)
    pshufb        m3, [PIC_sym(pb_0)]
%else
    punpcklbw     m3, m3
    pshuflw       m3, m3, q0000
    punpcklqdq    m3, m3
%endif
    pand          m1, m2
    pxor          m2, [PIC_sym(pb_right_ext_mask)]
    pand          m3, m2
    pxor          m2, [PIC_sym(pb_right_ext_mask)]
    por           m1, m3
    jmp .main_loop
.main_load:
    ; load subsequent line
    movu          m1, [srcptrq+3]
.main_loop:
%if ARCH_X86_64
    PALIGNR       m2, m1, m0, 10
    PALIGNR       m3, m1, m0, 11
    PALIGNR       m4, m1, m0, 12
    PALIGNR       m5, m1, m0, 13
    PALIGNR       m6, m1, m0, 14
    PALIGNR       m7, m1, m0, 15

    punpcklbw     m0, m2, m1
    punpckhbw     m2, m1
    punpcklbw     m8, m3, m7
    punpckhbw     m3, m7
    punpcklbw     m7, m4, m6
    punpckhbw     m4, m6
    PMADDUBSW     m0, m15, m6, m9, 1
    PMADDUBSW     m2, m15, m6, m9, 0
    PMADDUBSW     m8, m14, m6, m9, 0
    PMADDUBSW     m3, m14, m6, m9, 0
    PMADDUBSW     m7, m13, m6, m9, 0
    PMADDUBSW     m4, m13, m6, m9, 0
    paddw         m0, m8
    paddw         m2, m3
 %if cpuflag(ssse3)
    pxor          m6, m6
 %endif
    punpcklbw     m3, m5, m6
    punpckhbw     m5, m6
    psllw         m8, m3, 7
    psllw         m6, m5, 7
    psubw         m8, m10
    psubw         m6, m10
    pmullw        m3, m12
    pmullw        m5, m12
    paddw         m0, m7
    paddw         m2, m4
    paddw         m0, m3
    paddw         m2, m5
    paddsw        m0, m8 ; see the avx2 for an explanation
    paddsw        m2, m6 ; of how the clipping works here
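    ; (a rough sketch of that explanation: the AV1 wiener taps sum to 128,
    ; so the center tap is applied as fh[3]*px plus px << 7; the px << 7
    ; term is pre-biased by -16380 so that the preceding unsaturated adds
    ; stay within signed 16-bit range, letting the saturating paddsw clip
    ; where the full-precision sum would be clipped; the >> 3 plus the 2048
    ; bias below then yield the intermediate consumed by wiener_filter_v.)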
    psraw         m0, 3
    psraw         m2, 3
    paddw         m0, m11
    paddw         m2, m11
    mova [dstptrq+ 0], m0
    mova [dstptrq+16], m2
%else
    PALIGNR       m2, m1, m0, 10
    punpcklbw     m3, m2, m1
    punpckhbw     m2, m1
    PMADDUBSW     m3, m15, m4, m5, 1
    PMADDUBSW     m2, m15, m4, m5, 0
    PALIGNR       m4, m1, m0, 11
    PALIGNR       m5, m1, m0, 15
    punpcklbw     m6, m4, m5
    punpckhbw     m4, m5
    PMADDUBSW     m6, m14, m5, m7, 1
    PMADDUBSW     m4, m14, m5, m7, 0
    paddw         m3, m6
    paddw         m2, m4
    PALIGNR       m4, m1, m0, 12
    PALIGNR       m5, m1, m0, 14
    punpcklbw     m6, m4, m5
    punpckhbw     m4, m5
    PMADDUBSW     m6, m13, m5, m7, 1
    PMADDUBSW     m4, m13, m5, m7, 0
    paddw         m3, m6
    paddw         m2, m4
    PALIGNR       m6, m1, m0, 13
 %if cpuflag(ssse3)
    pxor          m5, m5
 %endif
    punpcklbw     m4, m6, m5
    punpckhbw     m6, m5
    psllw         m5, m4, 7
    psllw         m7, m6, 7
    psubw         m5, m10
    psubw         m7, m10
    pmullw        m4, m12
    pmullw        m6, m12
    paddw         m3, m4
    paddw         m2, m6
    paddsw        m3, m5
    paddsw        m2, m7
    psraw         m3, 3
    psraw         m2, 3
    paddw         m3, m11
    paddw         m2, m11
    mova [dstptrq+ 0], m3
    mova [dstptrq+16], m2
%endif

    mova          m0, m1
    add      srcptrq, 16
    add      dstptrq, 32
    sub           xd, 16
    cmp           xd, 16
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    cmp           xd, xlimd
    jg .splat_right

%if ARCH_X86_32
    mov         srcq, [esp+8]
    mov         dstq, [esp+4]
%endif
    add         srcq, strideq
    add         dstq, 384*2
    dec           hd
    jg .loop
    RET
%endmacro

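; wiener_filter_v: vertical 7-tap pass over the 16-bit intermediates. The
; coefficient pairs are (fv[0],fv[1]) and (fv[2],fv[3]+128) (the pw_0_128
; add restores the implicit +128 center weight), applied via pmaddwd to
; vertically mirrored row sums; the dword results are rounded with
; (x + 1024) >> 11 and packed down to pixels.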
%macro WIENER_V 0
%if ARCH_X86_64
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
    mov        edged, edgem
    movifnidn    fvq, fvmp
    movifnidn     hd, hm
    movq         m15, [fvq]
    pshufd       m14, m15, q1111
    pshufd       m15, m15, q0000
    paddw        m14, [pw_0_128]
    mova         m12, [pd_1024]

    DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr

    mov        ylimd, edged
    and        ylimd, 8 ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3 ; -3 if have_bottom=0, else -1
%else
cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
 %define ylimd [esp+12]

    mov          r5d, edgem
    and          r5d, 8
    shr          r5d, 2
    sub          r5d, 3
    mov        ylimd, r5d
    mov          fvq, fvmp
    mov        edged, edgem

    SETUP_PIC edged

    movq          m0, [fvq]
    pshufd        m1, m0, q1111
    pshufd        m0, m0, q0000
    paddw         m1, [PIC_sym(pw_0_128)]
    mova  [esp+0x50], m0
    mova  [esp+0x40], m1

    DEFINE_ARGS dst, stride, mid, w, h, y, edge
 %define mptrq      midq
 %define dstptrq    dstq
 %define edgeb      byte [esp]
%endif

    ; main x loop for vertical filter, does one column of 8 pixels
.loop_x:
    mova          m3, [midq] ; middle line

    ; load top pixels
    test       edgeb, 4 ; have_top
    jz .emu_top
    mova          m0, [midq-384*4]
    mova          m2, [midq-384*2]
    mova          m1, m0
    jmp .load_bottom_pixels
.emu_top:
    mova          m0, m3
    mova          m1, m3
    mova          m2, m3

    ; load bottom pixels
.load_bottom_pixels:
    mov           yd, hd
%if ARCH_X86_64
    mov        mptrq, midq
    mov      dstptrq, dstq
    add           yd, ylimd
%else
    mov      [esp+8], midq
    mov      [esp+4], dstq
    add           yd, ylimd
%endif
    jg .load_threelines

    ; the remainder here is somewhat messy but only runs in very weird
    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
    ; so performance is not terribly important here...
    je .load_twolines
    cmp           yd, -1
    je .load_oneline
    ; h == 1 case
    mova          m5, m3
    mova          m4, m3
    mova          m6, m3
    jmp .loop
.load_oneline:
    ; h == 2 case
    mova          m4, [midq+384*2]
    mova          m5, m4
    mova          m6, m4
    jmp .loop
.load_twolines:
    ; h == 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    mova          m6, m5
    jmp .loop
.load_threelines:
    ; h > 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    ; third line loaded in main loop below

    ; main y loop for vertical filter
.loop_load:
    ; load one line into m6. If that pixel is no longer available, do
    ; nothing, since m6 still has the data from the previous line in it. We
    ; try to structure the loop so that the common case is evaluated fastest
    mova          m6, [mptrq+384*6]
.loop:
%if ARCH_X86_64
    paddw         m7, m0, m6
    paddw         m8, m1, m5
    paddw         m9, m2, m4
    punpcklwd    m10, m7, m8
    punpckhwd     m7, m8
    punpcklwd    m11, m9, m3
    punpckhwd     m9, m3
    pmaddwd      m10, m15
    pmaddwd       m7, m15
    pmaddwd      m11, m14
    pmaddwd       m9, m14
    paddd        m10, m12
    paddd         m7, m12
    paddd        m10, m11
    paddd         m7, m9
    psrad        m10, 11
    psrad         m7, 11
    packssdw     m10, m7
    packuswb     m10, m10
    movq   [dstptrq], m10
%else
    mova  [esp+0x30], m1
    mova  [esp+0x20], m2
    mova  [esp+0x10], m3
    paddw         m0, m6
    paddw         m1, m5
    paddw         m2, m4
    punpcklwd     m7, m2, m3
    punpckhwd     m2, m3
    punpcklwd     m3, m0, m1
    punpckhwd     m0, m1
    mova          m1, [esp+0x50]
    pmaddwd       m3, m1
    pmaddwd       m0, m1
    mova          m1, [esp+0x40]
    pmaddwd       m7, m1
    pmaddwd       m2, m1
    paddd         m3, [PIC_sym(pd_1024)]
    paddd         m0, [PIC_sym(pd_1024)]
    paddd         m3, m7
    paddd         m0, m2
    psrad         m3, 11
    psrad         m0, 11
    packssdw      m3, m0
    packuswb      m3, m3
    movq      [dstq], m3
    mova          m1, [esp+0x30]
    mova          m2, [esp+0x20]
    mova          m3, [esp+0x10]
%endif
    ; shift pixels one position
    mova          m0, m1
    mova          m1, m2
    mova          m2, m3
    mova          m3, m4
    mova          m4, m5
    mova          m5, m6
    add        mptrq, 384*2
    add      dstptrq, strideq
    dec           yd
    jg .loop_load
    ; for the bottom pixels, continue using m6 (as extended edge)
    cmp           yd, ylimd
    jg .loop

%if ARCH_X86_32
    mov         midq, [esp+8]
    mov         dstq, [esp+4]
%endif
    add         midq, 16
    add         dstq, 8
    sub           wd, 8
    jg .loop_x
    RET
%endmacro
626
627INIT_XMM sse2
628WIENER_H
629WIENER_V
630
631INIT_XMM ssse3
632WIENER_H
633WIENER_V
634
635;;;;;;;;;;;;;;;;;;;;;;;;;;
636;;      self-guided     ;;
637;;;;;;;;;;;;;;;;;;;;;;;;;;
638
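; MULLD: emulates a 32-bit pmulld (SSE4.1) for the case used here, where the
; multiplier register holds a 16-bit value broadcast into every word lane:
; pmullw/pmulhuw form the low/high halves of each 16-bit product and the
; shifted high half is folded back in, yielding the low 32 bits of each
; dword product. Clobbers m5.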
%macro MULLD 2
    pmulhuw       m5, %1, %2
    pmullw        %1, %2
    pslld         m5, 16
    paddd         %1, m5
%endmacro

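; GATHERDD: emulates a 4-lane dword gather from the sgr_x_by_x table (base
; pointer in r5 on x86-64, PIC-relative on x86-32): each 16-bit index is
; extracted with movd/pextrw and the corresponding table data inserted with
; pinsrw, building the result in two halves that are merged with por.
; Clobbers m5 (initialized from m7, which callers keep zeroed) and r6.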
%macro GATHERDD 2
    mova          m5, m7
    movd         r6d, %2
 %if ARCH_X86_64
    movd          %1, [r5+r6]
    pextrw       r6d, %2, 2
    pinsrw        m5, [r5+r6+2], 3
    pextrw       r6d, %2, 4
    pinsrw        %1, [r5+r6+2], 5
    pextrw       r6d, %2, 6
    pinsrw        m5, [r5+r6+2], 7
 %else
    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
    pextrw       r6d, %2, 2
    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
    pextrw       r6d, %2, 4
    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
    pextrw       r6d, %2, 6
    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
 %endif
    por           %1, m5
%endmacro

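; sgr_box3_h: for each row, computes sliding 3-pixel horizontal sums of the
; source (into sum) and of the squared source (into sumsq), with rows strided
; (384+16) entries apart; the edge flags control left/right edge extension.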
%if ARCH_X86_64
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
    mov        xlimd, edgem
    movifnidn     xd, xm
    mov           hd, hm
    mov        edged, xlimd
    and        xlimd, 2                             ; have_right
    add           xd, xlimd
    xor        xlimd, 2                             ; 2*!have_right
%else
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
 %define wq     r0m
 %define xlimd  r1m
 %define hd     hmp
 %define edgeb  byte edgem

    mov           r6, edgem
    and           r6, 2                             ; have_right
    add           xd, r6
    xor           r6, 2                             ; 2*!have_right
    mov        xlimd, r6
    SETUP_PIC     r6, 0
%endif

    jnz .no_right
    add           xd, 7
    and           xd, ~7
.no_right:
    pxor          m1, m1
    lea         srcq, [srcq+xq]
    lea         sumq, [sumq+xq*2-2]
    lea       sumsqq, [sumsqq+xq*4-4]
    neg           xq
    mov           wq, xq
%if ARCH_X86_64
    lea          r10, [pb_right_ext_mask+16]
%endif
.loop_y:
    mov           xq, wq

    ; load left
    test       edgeb, 1                             ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    movd          m0, [leftq]
    pslldq        m0, 12
    add        leftq, 4
    jmp .expand_x
.no_left:
    movd          m0, [srcq+xq]
    pshufb        m0, [PIC_sym(pb_0)]
    jmp .expand_x
.load_left_from_main:
    movd          m0, [srcq+xq-2]
    pslldq        m0, 14
.expand_x:
    punpckhbw    xm0, xm1

    ; when we reach this, m0 contains the left two px in the highest words
    cmp           xd, -8
    jle .loop_x
.partial_load_and_extend:
    movd          m3, [srcq-4]
    pshufb        m3, [PIC_sym(pb_3)]
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
    punpcklbw     m3, m1
%if ARCH_X86_64
    movu          m4, [r10+xq*2]
%else
    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
%endif
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    pshufb        m2, m0, [PIC_sym(pb_14_15)]
    jmp .loop_x_noload

.loop_x:
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
.loop_x_noload:
    palignr       m3, m2, m0, 12
    palignr       m4, m2, m0, 14

    punpcklwd     m5, m3, m2
    punpckhwd     m6, m3, m2
    paddw         m3, m4
    punpcklwd     m7, m4, m1
    punpckhwd     m4, m1
    pmaddwd       m5, m5
    pmaddwd       m6, m6
    pmaddwd       m7, m7
    pmaddwd       m4, m4
    paddd         m5, m7
    paddd         m6, m4
    paddw         m3, m2
    movu [sumq+xq*2], m3
    movu [sumsqq+xq*4+ 0], m5
    movu [sumsqq+xq*4+16], m6

    mova          m0, m2
    add           xq, 8

    ; if x <= -8 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    add         srcq, strideq
    dec           hd
    jg .loop_y
    RET

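; sgr_box3_v: completes the 3x3 box sums by accumulating three vertically
; adjacent rows of the horizontal sums in place, one 8-entry column at a
; time; edge flags decide whether real top/bottom rows exist or the edge
; rows are replicated.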
%if ARCH_X86_64
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
    movifnidn  edged, edgem
%else
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
 %define sumsq_baseq dword [esp+0]
 %define sum_baseq   dword [esp+4]
 %define ylimd       dword [esp+8]
 %define m8          [esp+12]
    mov        edged, r4m
    mov           hd, r3m
%endif
    mov           xq, -2
%if ARCH_X86_64
    mov        ylimd, edged
    and        ylimd, 8                             ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
    mov  sumsq_baseq, sumsqq
    mov    sum_baseq, sumq
.loop_x:
    mov       sumsqq, sumsq_baseq
    mov         sumq, sum_baseq
    lea           yd, [hq+ylimq+2]
%else
    mov           yd, edged
    and           yd, 8                             ; have_bottom
    shr           yd, 2
    sub           yd, 2                             ; -2 if have_bottom=0, else 0
    mov  sumsq_baseq, sumsqq
    mov    sum_baseq, sumq
    mov        ylimd, yd
.loop_x:
    mov       sumsqd, sumsq_baseq
    mov         sumd, sum_baseq
    lea           yd, [hq+2]
    add           yd, ylimd
%endif
    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
    lea         sumq, [sumq+xq*2+2-(384+16)*2]
    test       edgeb, 4                             ; have_top
    jnz .load_top
    movu          m0, [sumsqq+(384+16)*4*1]
    movu          m1, [sumsqq+(384+16)*4*1+16]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    movu          m6, [sumq+(384+16)*2*1]
    mova          m7, m6
    mova          m8, m6
    jmp .loop_y_noload
.load_top:
    movu          m0, [sumsqq-(384+16)*4*1]      ; l2sq [left]
    movu          m1, [sumsqq-(384+16)*4*1+16]   ; l2sq [right]
    movu          m2, [sumsqq-(384+16)*4*0]      ; l1sq [left]
    movu          m3, [sumsqq-(384+16)*4*0+16]   ; l1sq [right]
    movu          m6, [sumq-(384+16)*2*1]        ; l2
    movu          m7, [sumq-(384+16)*2*0]        ; l1
.loop_y:
%if ARCH_X86_64
    movu          m8, [sumq+(384+16)*2*1]        ; l0
%else
    movu          m4, [sumq+(384+16)*2*1]        ; l0
    mova          m8, m4
%endif
    movu          m4, [sumsqq+(384+16)*4*1]      ; l0sq [left]
    movu          m5, [sumsqq+(384+16)*4*1+16]   ; l0sq [right]
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw         m6, m7
    paddd         m0, m4
    paddd         m1, m5
    paddw         m6, m8
    movu [sumsqq+ 0], m0
    movu [sumsqq+16], m1
    movu      [sumq], m6

    ; shift position down by one
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    mova          m3, m5
    mova          m6, m7
    mova          m7, m8
    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    dec           yd
    jg .loop_y
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET

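; sgr_calc_ab1: converts the 3x3 box sums into per-pixel SGR coefficients.
; Roughly, following the C reference: p = a*9 - b*b (non-negative by
; Cauchy-Schwarz), z = min((p*s + (1<<19)) >> 20, 255) (the word-wise
; saturating paddusw with pd_0xF00801C7 performs both the rounding and the
; clamp, folded into the biased sgr_x_by_x-0xF03 table base), x = table[z],
; and the outputs are b' = 256 - x and a' = (x*b*455 + (1<<11)) >> 12,
; where 455 = 0x1C7 (the constant's low word) approximates (1<<12)/9.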
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
    movifnidn     sd, sm
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
%if ARCH_X86_64
    LEA           r5, sgr_x_by_x-0xF03
%else
    SETUP_PIC r5, 0
%endif
    movd          m6, sd
    pshuflw       m6, m6, q0000
    punpcklqdq    m6, m6
    pxor          m7, m7
    DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
    mova          m8, [pd_0xF00801C7]
    mova          m9, [pw_256]
    psrld        m10, m9, 13                        ; pd_2048
    mova         m11, [pb_unpcklwdw]
%else
 %define m8     [PIC_sym(pd_0xF00801C7)]
 %define m9     [PIC_sym(pw_256)]
 %define m10    [PIC_sym(pd_2048)]
 %define m11    [PIC_sym(pb_unpcklwdw)]
%endif
.loop_y:
    mov           xq, -2
.loop_x:
    movq          m0, [bq+xq*2]
    movq          m1, [bq+xq*2+(384+16)*2]
    punpcklwd     m0, m7
    punpcklwd     m1, m7
    movu          m2, [aq+xq*4]
    movu          m3, [aq+xq*4+(384+16)*4]
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m2, m4                            ; aa * 9
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    pmaddwd       m0, m8
    pmaddwd       m1, m8
    psubd         m2, m4                            ; p = aa * 9 - bb * bb
    psubd         m3, m5
    MULLD         m2, m6
    MULLD         m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                            ; z
    psrld         m3, 20
    GATHERDD      m4, m2                            ; xx
    GATHERDD      m2, m3
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pshufb        m4, m11
    MULLD         m0, m4
    pshufb        m2, m11
    MULLD         m1, m2
    psubw         m5, m9, m3
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 12
    psrld         m1, 12
    movq   [bq+xq*2], m5
    psrldq        m5, 8
    movq [bq+xq*2+(384+16)*2], m5
    movu   [aq+xq*4], m0
    movu [aq+xq*4+(384+16)*4], m1
    add           xd, 4
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

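; sgr_finish_filter1: applies the weighted 3x3 neighbourhood sums of the a/b
; coefficient planes (weight 4 for the centre and edge-adjacent neighbours,
; 3 for the diagonals, obtained as 4*sum9 - diagonals) and produces the
; intermediate t = (A*src + B + (1<<8)) >> 9 with rows strided 384*2 apart.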
%if ARCH_X86_64
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
                                       tmp_base, src_base, a_base, b_base, x, y
    movifnidn     wd, wm
    mov           hd, hm
    mova         m15, [pw_16]
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    xor           xd, xd
%else
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
 %define tmp_baseq  [esp+8]
 %define src_baseq  [esp+12]
 %define a_baseq    [esp+16]
 %define b_baseq    [esp+20]
 %define wd         [esp+24]
 %define hd         [esp+28]
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    mov           wd, xd
    mov           hd, yd
    xor           xd, xd
    SETUP_PIC yd, 1, 1
    jmp .loop_start
%endif

.loop_x:
    mov           tq, tmp_baseq
    mov         srcq, src_baseq
    mov           aq, a_baseq
    mov           bq, b_baseq
%if ARCH_X86_32
.loop_start:
    movu          m0, [bq+xq*2-(384+16)*2-2]
    movu          m2, [bq+xq*2-(384+16)*2+2]
    mova          m1, [bq+xq*2-(384+16)*2]          ; b:top
    paddw         m0, m2                            ; b:tl+tr
    movu          m2, [bq+xq*2-2]
    movu          m3, [bq+xq*2+2]
    paddw         m1, [bq+xq*2]                     ; b:top+ctr
    paddw         m2, m3                            ; b:l+r
    mova  [esp+0x80], m0
    mova  [esp+0x70], m1
    mova  [esp+0x60], m2
%endif
    movu          m0, [aq+xq*4-(384+16)*4-4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
    paddd         m0, m2                            ; a:tl+tr [first half]
    movu          m2, [aq+xq*4-(384+16)*4-4+16]
    movu          m4, [aq+xq*4-(384+16)*4+4+16]
    mova          m3, [aq+xq*4-(384+16)*4+16]       ; a:top [second half]
    paddd         m2, m4                            ; a:tl+tr [second half]
    movu          m4, [aq+xq*4-4]
    movu          m5, [aq+xq*4+4]
    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
    paddd         m4, m5                            ; a:l+r [first half]
    movu          m5, [aq+xq*4+16-4]
    movu          m6, [aq+xq*4+16+4]
    paddd         m3, [aq+xq*4+16]                  ; a:top+ctr [second half]
    paddd         m5, m6                            ; a:l+r [second half]
%if ARCH_X86_64
    movu          m6, [bq+xq*2-(384+16)*2-2]
    movu          m8, [bq+xq*2-(384+16)*2+2]
    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
    paddw         m6, m8                            ; b:tl+tr
    movu          m8, [bq+xq*2-2]
    movu          m9, [bq+xq*2+2]
    paddw         m7, [bq+xq*2]                     ; b:top+ctr
    paddw         m8, m9                            ; b:l+r
%endif

    lea           tq, [tq+xq*2]
    lea         srcq, [srcq+xq*1]
    lea           aq, [aq+xq*4+(384+16)*4]
    lea           bq, [bq+xq*2+(384+16)*2]
    mov           yd, hd
.loop_y:
%if ARCH_X86_64
    movu          m9, [bq-2]
    movu         m10, [bq+2]
    paddw         m7, [bq]                          ; b:top+ctr+bottom
    paddw         m9, m10                           ; b:bl+br
    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
    paddw         m6, m9                            ; b:tl+tr+bl+br
    psubw         m7, [bq-(384+16)*2*2]             ; b:ctr+bottom
    paddw        m10, m6
    psllw        m10, 2
    psubw        m10, m6                            ; aa
    pxor         m14, m14
    movq         m12, [srcq]
    punpcklbw    m12, m14
    punpcklwd     m6, m10, m15
    punpckhwd    m10, m15
    punpcklwd    m13, m12, m15
    punpckhwd    m12, m15
    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
%else
    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
    mova  [esp+0x50], m1
    mova  [esp+0x40], m3
    mova  [esp+0x30], m4
    movu          m6, [aq-4]
    movu          m7, [aq+4]
    paddd         m1, m4                            ; a:top+ctr+bottom+l+r [first half]
    paddd         m3, m5                            ; a:top+ctr+bottom+l+r [second half]
    paddd         m6, m7                            ; a:bl+br [first half]
    movu          m7, [aq+16-4]
    movu          m4, [aq+16+4]
    paddd         m7, m4                            ; a:bl+br [second half]
    paddd         m0, m6                            ; a:tl+tr+bl+br [first half]
    paddd         m2, m7                            ; a:tl+tr+bl+br [second half]
    paddd         m1, m0
    paddd         m3, m2
    pslld         m1, 2
    pslld         m3, 2
    psubd         m1, m0                            ; bb [first half]
    psubd         m3, m2                            ; bb [second half]
%endif

%if ARCH_X86_64
    movu         m11, [aq-4]
    movu         m12, [aq+4]
    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
    paddd        m11, m12                           ; a:bl+br [first half]
    movu         m12, [aq+16-4]
    movu         m13, [aq+16+4]
    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
    paddd        m12, m13                           ; a:bl+br [second half]
    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
    paddd        m13, m0
    paddd        m14, m2
    pslld        m13, 2
    pslld        m14, 2
    psubd        m13, m0                            ; bb [first half]
    psubd        m14, m2                            ; bb [second half]
    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
%else
    mova          m4, [esp+0x80]
    mova  [esp+0x80], m5
    mova          m5, [esp+0x70]
    mova  [esp+0x70], m6
    mova          m6, [esp+0x60]
    mova  [esp+0x60], m7
    mova  [esp+0x20], m1
    movu          m7, [bq-2]
    movu          m1, [bq+2]
    paddw         m5, [bq]                          ; b:top+ctr+bottom
    paddw         m7, m1
    paddw         m1, m5, m6                        ; b:top+ctr+bottom+l+r
    paddw         m4, m7                            ; b:tl+tr+bl+br
    psubw         m5, [bq-(384+16)*2*2]             ; b:ctr+bottom
    paddw         m1, m4
    psllw         m1, 2
    psubw         m1, m4                            ; aa
    movq          m0, [srcq]
    XCHG_PIC_REG
    punpcklbw     m0, [PIC_sym(pb_right_ext_mask)+16]
    punpcklwd     m4, m1, [PIC_sym(pw_16)]
    punpckhwd     m1, [PIC_sym(pw_16)]
    punpcklwd     m2, m0, [PIC_sym(pw_16)]
    punpckhwd     m0, [PIC_sym(pw_16)]
    XCHG_PIC_REG
    pmaddwd       m4, m2                            ; aa*src[x]+256 [first half]
    pmaddwd       m1, m0                            ; aa*src[x]+256 [second half]
%endif

%if ARCH_X86_64
    paddd         m6, m13
    paddd        m10, m14
    psrad         m6, 9
    psrad        m10, 9
    packssdw      m6, m10
    mova        [tq], m6
%else
    paddd         m4, [esp+0x20]
    paddd         m1, m3
    psrad         m4, 9
    psrad         m1, 9
    packssdw      m4, m1
    mova        [tq], m4
%endif

    ; shift to next row
%if ARCH_X86_64
    mova          m0, m4
    mova          m2, m5
    mova          m4, m11
    mova          m5, m12
    mova          m6, m8
    mova          m8, m9
%else
    mova          m1, [esp+0x50]
    mova          m3, [esp+0x40]
    mova          m0, [esp+0x30]
    mova          m2, [esp+0x80]
    mova          m4, [esp+0x70]
    mova  [esp+0x70], m5
    mova          m5, [esp+0x60]
    mova  [esp+0x80], m6
    mova  [esp+0x60], m7
    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
%endif

    add         srcq, strideq
    add           aq, (384+16)*4
    add           bq, (384+16)*2
    add           tq, 384*2
    dec           yd
    jg .loop_y
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET

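; sgr_weighted1: blends the filtered plane back into dst. With the weight
; pre-scaled by 16 for pmulhrsw, each pixel becomes roughly
; dst + ((t - (dst << 4)) * wt + (1 << 10)) >> 11, packed with saturation.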
cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
    movifnidn     hd, hm
%if ARCH_X86_32
    SETUP_PIC r6, 0
%endif
    movd          m0, wtm
    pshufb        m0, [PIC_sym(pb_0_1)]
    psllw         m0, 4
    pxor          m7, m7
    DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [tq+idxq*2+ 0]
    mova          m4, [tq+idxq*2+16]
    mova          m5, [dstq+idxq]
    punpcklbw     m2, m5, m7
    punpckhbw     m5, m7
    psllw         m3, m2, 4
    psllw         m6, m5, 4
    psubw         m1, m3
    psubw         m4, m6
    pmulhrsw      m1, m0
    pmulhrsw      m4, m0
    paddw         m1, m2
    paddw         m4, m5
    packuswb      m1, m4
    mova [dstq+idxq], m1
    add         idxd, 16
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add           tq, 384 * 2
    dec           hd
    jg .loop_y
    RET

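; sgr_box5_h: same as sgr_box3_h, but with sliding 5-pixel horizontal sums
; of the source and squared source for the 5x5 box of the second SGR pass.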
%if ARCH_X86_64
cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    mov        edged, edgem
    movifnidn     wd, wm
    mov           hd, hm
    mova         m10, [pb_0]
    mova         m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
 %define edgeb      byte edgem
 %define wd         xd
 %define wq         wd
 %define wm         r5m
 %define strideq    r4m
    SUB          esp, 8
    SETUP_PIC sumsqd, 1, 1

 %define m10    [PIC_sym(pb_0)]
 %define m11    [PIC_sym(pb_0_1)]
%endif

    test       edgeb, 2                             ; have_right
    jz .no_right
    xor        xlimd, xlimd
    add           wd, 2
    add           wd, 15
    and           wd, ~15
    jmp .right_done
.no_right:
    mov        xlimd, 3
    dec           wd
.right_done:
    pxor          m1, m1
    lea         srcq, [srcq+wq+1]
    lea         sumq, [sumq+wq*2-2]
    lea       sumsqq, [sumsqq+wq*4-4]
    neg           wq
%if ARCH_X86_64
    lea          r10, [pb_right_ext_mask+16]
%else
    mov           wm, xd
 %define wq wm
%endif

.loop_y:
    mov           xq, wq
    ; load left
    test       edgeb, 1                             ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    movd          m0, [leftq]
    movd          m2, [srcq+xq-1]
    pslldq        m2, 4
    por           m0, m2
    pslldq        m0, 11
    add        leftq, 4
    jmp .expand_x
.no_left:
    movd          m0, [srcq+xq-1]
    XCHG_PIC_REG
    pshufb        m0, m10
    XCHG_PIC_REG
    jmp .expand_x
.load_left_from_main:
    movd          m0, [srcq+xq-4]
    pslldq        m0, 12
.expand_x:
    punpckhbw     m0, m1

    ; when we reach this, m0 contains the left two px in the highest words
    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jge .right_extend
.partial_load_and_extend:
    XCHG_PIC_REG
    movd          m3, [srcq-1]
    movq          m2, [srcq+xq]
    pshufb        m3, m10
    punpcklbw     m3, m1
    punpcklbw     m2, m1
%if ARCH_X86_64
    movu          m4, [r10+xq*2]
%else
    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
    XCHG_PIC_REG
%endif
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    psrldq        m2, m0, 14
    XCHG_PIC_REG
    pshufb        m2, m11
    XCHG_PIC_REG
    jmp .loop_x_noload

.loop_x:
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
.loop_x_noload:
    palignr       m3, m2, m0, 8
    palignr       m4, m2, m0, 10
    palignr       m5, m2, m0, 12
    palignr       m6, m2, m0, 14

%if ARCH_X86_64
    paddw         m0, m3, m2
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    paddw         m0, m4
    punpcklwd     m8, m4, m5
    punpckhwd     m4, m5
    paddw         m0, m5
    punpcklwd     m9, m6, m1
    punpckhwd     m5, m6, m1
    paddw         m0, m6
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m8, m8
    pmaddwd       m4, m4
    pmaddwd       m9, m9
    pmaddwd       m5, m5
    paddd         m7, m8
    paddd         m3, m4
    paddd         m7, m9
    paddd         m3, m5
    movu [sumq+xq*2], m0
    movu [sumsqq+xq*4+ 0], m7
    movu [sumsqq+xq*4+16], m3
%else
    paddw         m0, m3, m2
    paddw         m0, m4
    paddw         m0, m5
    paddw         m0, m6
    movu [sumq+xq*2], m0
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    punpcklwd     m0, m4, m5
    punpckhwd     m4, m5
    punpckhwd     m5, m6, m1
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m0, m0
    pmaddwd       m4, m4
    pmaddwd       m5, m5
    paddd         m7, m0
    paddd         m3, m4
    paddd         m3, m5
    punpcklwd     m0, m6, m1
    pmaddwd       m0, m0
    paddd         m7, m0
    movu [sumsqq+xq*4+ 0], m7
    movu [sumsqq+xq*4+16], m3
%endif

    mova          m0, m2
    add           xq, 8

    ; if x <= -8 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add         srcq, strideq
    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    dec           hd
    jg .loop_y
%if ARCH_X86_32
    ADD          esp, 8
%endif
    RET

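; sgr_box5_v: vertical 5-row accumulation of the box5 sums, processed two
; rows at a time. The x86-32 version does not have enough registers to carry
; both planes at once, so it runs a sumsq pass followed by a separate sum
; pass.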
%if ARCH_X86_64
cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    movifnidn  edged, edgem
    mov        ylimd, edged
%else
cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
 %define wm     [esp+0]
 %define hm     [esp+4]
 %define edgem  [esp+8]
    mov           wm, xd
    mov           hm, yd
    mov        edgem, ylimd
%endif

    and        ylimd, 8                             ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
    mov           xq, -2
%if ARCH_X86_64
.loop_x:
    lea           yd, [hd+ylimd+2]
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
    test       edgeb, 4                             ; have_top
    jnz .load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    movu         m10, [sum_ptrq+(384+16)*2*1]
    mova         m11, m10
    mova         m12, m10
    mova         m13, m10
    jmp .loop_y_second_load
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
    mova          m2, m0
    mova          m3, m1
    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
    mova         m11, m10
.loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
.loop_y_second_load:
    test          yd, yd
    jle .emulate_second_load
    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
    movu          m9, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw        m10, m11
    paddd         m0, m4
    paddd         m1, m5
    paddw        m10, m12
    paddd         m0, m6
    paddd         m1, m7
    paddw        m10, m13
    paddd         m0, m8
    paddd         m1, m9
    paddw        m10, m14
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+16], m1
    movu  [sum_ptrq], m10

    ; shift position down by one
    mova          m0, m4
    mova          m1, m5
    mova          m2, m6
    mova          m3, m7
    mova          m4, m8
    mova          m5, m9
    mova         m10, m12
    mova         m11, m13
    mova         m12, m14
    add   sumsq_ptrq, (384+16)*4*2
    add     sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .loop_y
    ; l1 = l0
    mova          m6, m8
    mova          m7, m9
    mova         m13, m14
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET
.emulate_second_load:
    mova          m8, m6
    mova          m9, m7
    mova         m14, m13
    jmp .loop_y_noload
%else
.sumsq_loop_x:
    lea           yd, [ylimd+2]
    add           yd, hm
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    test  byte edgem, 4                             ; have_top
    jnz .sumsq_load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    mova  [esp+0x1c], m0
    mova  [esp+0x0c], m1
    jmp .sumsq_loop_y_second_load
.sumsq_load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
    mova  [esp+0x1c], m0
    mova  [esp+0x0c], m1
.sumsq_loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
.sumsq_loop_y_second_load:
    test          yd, yd
    jle .sumsq_emulate_second_load
    movu          m2, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
    movu          m3, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
.sumsq_loop_y_noload:
    paddd         m0, [esp+0x1c]
    paddd         m1, [esp+0x0c]
    paddd         m0, m4
    paddd         m1, m5
    paddd         m0, m6
    paddd         m1, m7
    paddd         m0, m2
    paddd         m1, m3
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+16], m1

    ; shift position down by one
    mova          m0, m4
    mova          m1, m5
    mova          m4, m2
    mova          m5, m3
    mova  [esp+0x1c], m6
    mova  [esp+0x0c], m7
    add   sumsq_ptrq, (384+16)*4*2
    sub           yd, 2
    jge .sumsq_loop_y
    ; l1 = l0
    mova          m6, m2
    mova          m7, m3
    cmp           yd, ylimd
    jg .sumsq_loop_y_noload
    add           xd, 8
    cmp           xd, wm
    jl .sumsq_loop_x

    mov           xd, -2
.sum_loop_x:
    lea           yd, [ylimd+2]
    add           yd, hm
    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test  byte edgem, 4                             ; have_top
    jnz .sum_load_top
    movu          m0, [sum_ptrq+(384+16)*2*1]
    mova          m1, m0
    mova          m2, m0
    mova          m3, m0
    jmp .sum_loop_y_second_load
.sum_load_top:
    movu          m0, [sum_ptrq-(384+16)*2*1]        ; l3/4
    movu          m2, [sum_ptrq-(384+16)*2*0]        ; l2
    mova          m1, m0
.sum_loop_y:
    movu          m3, [sum_ptrq+(384+16)*2*1]        ; l1
.sum_loop_y_second_load:
    test          yd, yd
    jle .sum_emulate_second_load
    movu          m4, [sum_ptrq+(384+16)*2*2]        ; l0
.sum_loop_y_noload:
    paddw         m0, m1
    paddw         m0, m2
    paddw         m0, m3
    paddw         m0, m4
    movu  [sum_ptrq], m0

    ; shift position down by one
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    add     sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .sum_loop_y
    ; l1 = l0
    mova          m3, m4
    cmp           yd, ylimd
    jg .sum_loop_y_noload
    add           xd, 8
    cmp           xd, wm
    jl .sum_loop_x
    RET
.sumsq_emulate_second_load:
    mova          m2, m6
    mova          m3, m7
    jmp .sumsq_loop_y_noload
.sum_emulate_second_load:
    mova          m4, m3
    jmp .sum_loop_y_noload
%endif

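; sgr_calc_ab2: same as sgr_calc_ab1 but for the 5x5 box: p = a*25 - b*b,
; and the final a' is roughly (x*b*41 + (1<<9)) >> 10, where 41 = 0x29
; (the low word of pd_0xF0080029) makes 41/1024 approximate 1/25.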
cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
    movifnidn     sd, sm
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
%if ARCH_X86_64
    LEA           r5, sgr_x_by_x-0xF03
%else
    SETUP_PIC r5, 0
%endif
    movd          m6, sd
    pshuflw       m6, m6, q0000
    punpcklqdq    m6, m6
    pxor          m7, m7
    DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
    mova          m8, [pd_0xF0080029]
    mova          m9, [pw_256]
    psrld        m10, m9, 15                        ; pd_512
%else
 %define m8     [PIC_sym(pd_0xF0080029)]
 %define m9     [PIC_sym(pw_256)]
 %define m10    [PIC_sym(pd_512)]
%endif
.loop_y:
    mov           xq, -2
.loop_x:
    movq          m0, [bq+xq*2+0]
    movq          m1, [bq+xq*2+8]
    punpcklwd     m0, m7
    punpcklwd     m1, m7
    movu          m2, [aq+xq*4+ 0]
    movu          m3, [aq+xq*4+16]
    pslld         m4, m2, 3                         ; aa * 8
    pslld         m5, m3, 3
    paddd         m2, m4                            ; aa * 9
    paddd         m3, m5
    paddd         m4, m4                            ; aa * 16
    paddd         m5, m5
    paddd         m2, m4                            ; aa * 25
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    psubd         m2, m4                            ; p = aa * 25 - bb * bb
    psubd         m3, m5
    MULLD         m2, m6
    MULLD         m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                            ; z
    psrld         m3, 20
    GATHERDD      m4, m2                            ; xx
    GATHERDD      m2, m3
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pmullw        m4, m8
    pmullw        m2, m8
    psubw         m5, m9, m3
    pmaddwd       m0, m4
    pmaddwd       m1, m2
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 10
    psrld         m1, 10
    movu   [bq+xq*2], m5
    movu [aq+xq*4+ 0], m0
    movu [aq+xq*4+16], m1
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

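; sgr_finish_filter2: the 5x5-box counterpart of sgr_finish_filter1,
; producing two output rows per iteration. The a/b rows are filtered
; horizontally with (5,6,5) weights (pw_5_6); even rows combine two
; vertically adjacent weighted rows with t = (A*src + B + (1<<8)) >> 9,
; odd rows use a single weighted row with t = (A*src + B + (1<<7)) >> 8.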
%if ARCH_X86_64
cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
                                       tmp_base, src_base, a_base, b_base, x, y
    movifnidn     wd, wm
    mov           hd, hm
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    mova          m9, [pw_5_6]
    mova         m12, [pw_256]
    psrlw        m10, m12, 8                    ; pw_1
    psrlw        m11, m12, 1                    ; pw_128
    pxor         m13, m13
%else
cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
 %define tmp_baseq  r0m
 %define src_baseq  r1m
 %define a_baseq    r3m
 %define b_baseq    r4m
 %define wd         r5m
 %define hd         r6m

    SUB          esp, 8
    SETUP_PIC yd

 %define m8     m5
 %define m9     [PIC_sym(pw_5_6)]
 %define m10    [PIC_sym(pw_1)]
 %define m11    [PIC_sym(pw_128)]
 %define m12    [PIC_sym(pw_256)]
 %define m13    m0
%endif
    xor           xd, xd
.loop_x:
    mov           tq, tmp_baseq
    mov         srcq, src_baseq
    mov           aq, a_baseq
    mov           bq, b_baseq
    movu          m0, [aq+xq*4-(384+16)*4-4]
    mova          m1, [aq+xq*4-(384+16)*4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    movu          m3, [aq+xq*4-(384+16)*4-4+16]
    mova          m4, [aq+xq*4-(384+16)*4+16]
    movu          m5, [aq+xq*4-(384+16)*4+4+16]
    paddd         m0, m2
    paddd         m3, m5
    paddd         m0, m1
    paddd         m3, m4
    pslld         m2, m0, 2
    pslld         m5, m3, 2
    paddd         m2, m0
    paddd         m5, m3
    paddd         m0, m2, m1                    ; prev_odd_b [first half]
    paddd         m1, m5, m4                    ; prev_odd_b [second half]
    movu          m3, [bq+xq*2-(384+16)*2-2]
    mova          m4, [bq+xq*2-(384+16)*2]
    movu          m5, [bq+xq*2-(384+16)*2+2]
    paddw         m3, m5
    punpcklwd     m5, m3, m4
    punpckhwd     m3, m4
    pmaddwd       m5, m9
    pmaddwd       m3, m9
    mova          m2, m5
    packssdw      m2, m3                        ; prev_odd_a
    lea           tq, [tq+xq*2]
    lea         srcq, [srcq+xq*1]
    lea           aq, [aq+xq*4+(384+16)*4]
    lea           bq, [bq+xq*2+(384+16)*2]
%if ARCH_X86_32
    mov        [esp], PIC_reg
%endif
    mov           yd, hd
    XCHG_PIC_REG
.loop_y:
    movu          m3, [aq-4]
    mova          m4, [aq]
    movu          m5, [aq+4]
    paddd         m3, m5
    paddd         m3, m4
    pslld         m5, m3, 2
    paddd         m5, m3
    paddd         m5, m4                        ; cur_odd_b [first half]
    movu          m3, [aq+16-4]
    mova          m6, [aq+16]
    movu          m7, [aq+16+4]
    paddd         m3, m7
    paddd         m3, m6
    pslld         m7, m3, 2
    paddd         m7, m3
    paddd         m4, m7, m6                    ; cur_odd_b [second half]
    movu          m3, [bq-2]
    mova          m6, [bq]
    movu          m7, [bq+2]
    paddw         m3, m7
    punpcklwd     m7, m3, m6
    punpckhwd     m3, m6
    pmaddwd       m7, m9
    pmaddwd       m3, m9
    packssdw      m6, m7, m3                    ; cur_odd_a

    paddd         m0, m5                        ; cur_even_b [first half]
    paddd         m1, m4                        ; cur_even_b [second half]
    paddw         m2, m6                        ; cur_even_a

    movq          m3, [srcq]
%if ARCH_X86_64
    punpcklbw     m3, m13
%else
    mova        [td], m5
    pxor          m7, m7
    punpcklbw     m3, m7
%endif
    punpcklwd     m7, m3, m10
    punpckhwd     m3, m10
    punpcklwd     m8, m2, m12
    punpckhwd     m2, m12
    pmaddwd       m7, m8
    pmaddwd       m3, m2
    paddd         m7, m0
    paddd         m3, m1
    psrad         m7, 9
    psrad         m3, 9

%if ARCH_X86_32
    pxor         m13, m13
%endif
    movq          m8, [srcq+strideq]
    punpcklbw     m8, m13
    punpcklwd     m0, m8, m10
    punpckhwd     m8, m10
    punpcklwd     m1, m6, m11
    punpckhwd     m2, m6, m11
    pmaddwd       m0, m1
    pmaddwd       m8, m2
%if ARCH_X86_64
    paddd         m0, m5
%else
    paddd         m0, [td]
%endif
    paddd         m8, m4
    psrad         m0, 8
    psrad         m8, 8

    packssdw      m7, m3
    packssdw      m0, m8
%if ARCH_X86_32
    mova          m5, [td]
%endif
    mova [tq+384*2*0], m7
    mova [tq+384*2*1], m0

    mova          m0, m5
    mova          m1, m4
    mova          m2, m6
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    add           tq, 384*2*2
    lea         srcq, [srcq+strideq*2]
%if ARCH_X86_64
    sub           yd, 2
%else
    sub dword [esp+4], 2
%endif
    jg .loop_y
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
%if ARCH_X86_32
    ADD          esp, 8
%endif
    RET

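; sgr_weighted2: blends the two filtered planes into dst with a pair of
; weights: dst + ((t1 - dst*16)*wt0 + (t2 - dst*16)*wt1 + (1<<10)) >> 11,
; computed with pmaddwd on interleaved (t1,t2) pairs.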
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
    movifnidn     wd, wm
    movd          m0, wtm
%if ARCH_X86_64
    movifnidn     hd, hm
    mova         m10, [pd_1024]
    pxor         m11, m11
%else
    SETUP_PIC     hd, 0
 %define m10    [PIC_sym(pd_1024)]
 %define m11    m7
%endif
    pshufd        m0, m0, 0
    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
%if ARCH_X86_32
 %define hd     hmp
%endif

.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [t1q+idxq*2+ 0]
    mova          m2, [t1q+idxq*2+16]
    mova          m3, [t2q+idxq*2+ 0]
    mova          m4, [t2q+idxq*2+16]
    mova          m6, [dstq+idxq]
%if ARCH_X86_32
    pxor         m11, m11
%endif
    punpcklbw     m5, m6, m11
    punpckhbw     m6, m11
    psllw         m7, m5, 4
    psubw         m1, m7
    psubw         m3, m7
    psllw         m7, m6, 4
    psubw         m2, m7
    psubw         m4, m7
    punpcklwd     m7, m1, m3
    punpckhwd     m1, m3
    punpcklwd     m3, m2, m4
    punpckhwd     m2, m4
    pmaddwd       m7, m0
    pmaddwd       m1, m0
    pmaddwd       m3, m0
    pmaddwd       m2, m0
    paddd         m7, m10
    paddd         m1, m10
    paddd         m3, m10
    paddd         m2, m10
    psrad         m7, 11
    psrad         m1, 11
    psrad         m3, 11
    psrad         m2, 11
    packssdw      m7, m1
    packssdw      m3, m2
    paddw         m7, m5
    paddw         m3, m6
    packuswb      m7, m3
    mova [dstq+idxq], m7
    add         idxd, 16
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add          t1q, 384 * 2
    add          t2q, 384 * 2
    dec           hd
    jg .loop_y
    RET