; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
pb_right_ext_mask: times 32 db 0xff
                   times 32 db 0
pb_14x0_1_2: times 14 db 0
             db 1, 2
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_15: times 16 db 15
pw_16: times 2 dw 16
pw_256: times 2 dw 256
pw_2048: times 2 dw 2048
pw_16380: times 2 dw 16380
pw_0_128: dw 0, 128
pw_5_6: dw 5, 6
pd_6: dd 6
pd_1024: dd 1024
pd_0xf0080029: dd 0xf0080029
pd_0xf00801c7: dd 0xf00801c7

cextern sgr_x_by_x

SECTION .text
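
; All routines below use dav1d's loop restoration buffer layout: rows of
; intermediate values with a fixed stride of 384 units ((384+16) for the
; sgr sum/sumsq buffers), hence the recurring 384*2, (384+16)*2 and
; (384+16)*4 byte strides.

; wiener_filter_h: horizontal 7-tap wiener filter, 8bpc. A rough sketch of
; the per-pixel reference math (informal pseudo-C, not a line-by-line
; description of the asm):
;
;   for (int x = 0; x < w; x++) {
;       int sum = src[x + 3] << 7;             // extra weight on center tap
;       for (int k = 0; k < 7; k++)
;           sum += fh[k] * src[x + k];         // fh[k] == fh[6 - k]
;       mid[x] = (sum >> 3) + offset;          // stored as offset int16
;   }
;
; The asm exploits the tap symmetry: each pmaddubsw sums one mirrored pixel
; pair against a single coefficient, and pw_16380/pw_2048 keep the result
; in an offset signed range so that paddsw doubles as the clip.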
INIT_YMM avx2
cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
    vpbroadcastb m15, [fhq+0]
    vpbroadcastb m14, [fhq+2]
    vpbroadcastb m13, [fhq+4]
    vpbroadcastw m12, [fhq+6]
    vpbroadcastd m11, [pw_2048]
    vpbroadcastd m10, [pw_16380]
    lea          r11, [pb_right_ext_mask]

    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim

    ; if (edge & has_right) align_w_to_32
    ; else w -= 3, and use that as limit in x loop
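    ; i.e.: if (edge & has_right) { w = (w + 31) & ~31; xlim = 0; }
    ;       else                    xlim = -3;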
    test       edged, 2 ; has_right
    jnz .align
    mov        xlimq, -3
    jmp .loop
.align:
    add           wd, 31
    and           wd, ~31
    xor        xlimd, xlimd

    ; main y loop for the horizontal filter
.loop:
    mov      srcptrq, srcq
    mov      dstptrq, dstq
    lea           xq, [wq+xlimq]

    ; load left edge pixels
    test       edged, 1 ; have_left
    jz .emu_left
    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
    jz .load_left_combined
    movd         xm0, [leftq]
    add        leftq, 4
    pinsrd       xm0, [srcq], 1
    pslldq       xm0, 9
    jmp .left_load_done
.load_left_combined:
    movq         xm0, [srcq-3]
    pslldq       xm0, 10
    jmp .left_load_done
.emu_left:
    movd         xm0, [srcq]
    pshufb       xm0, [pb_14x0_1_2]

    ; load right edge pixels
.left_load_done:
    cmp           xd, 32
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    je .splat_right

    ; for very small images (w=[1-2]), edge-extend the pixels we already
    ; loaded; ugly, but it only runs in very rare cases
    add           wd, wd
    pshufb       xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
    shr           wd, 1

    ; main x loop, mostly this starts in .main_load
.splat_right:
    ; no need to load new pixels, just extend them from the (possibly
    ; previously extended) previous load into m0
    pshufb       xm1, xm0, [pb_15]
    jmp .main_loop
.load_and_splat:
    ; load new pixels and extend edge for right-most
    movu          m1, [srcptrq+3]
    sub          r11, xq
    movu          m2, [r11+32] ; pb_right_ext_mask + 32 - x
    add          r11, xq
    vpbroadcastb  m3, [srcptrq+2+xq]
    pand          m1, m2
    pandn         m3, m2, m3
    por           m1, m3
    jmp .main_loop
.main_load:
    ; load the next 32 pixels of this row
    movu          m1, [srcptrq+3]
.main_loop:
    vinserti128   m0, xm1, 1

    palignr       m2, m1, m0, 10
    palignr       m3, m1, m0, 11
    palignr       m4, m1, m0, 12
    palignr       m5, m1, m0, 13
    palignr       m6, m1, m0, 14
    palignr       m7, m1, m0, 15

    punpcklbw     m0, m2, m1
    punpckhbw     m2, m1
    punpcklbw     m8, m3, m7
    punpckhbw     m3, m7
    punpcklbw     m7, m4, m6
    punpckhbw     m4, m6
    pxor          m9, m9
    punpcklbw     m6, m5, m9
    punpckhbw     m5, m9

    pmaddubsw     m0, m15
    pmaddubsw     m2, m15
    pmaddubsw     m8, m14
    pmaddubsw     m3, m14
    pmaddubsw     m7, m13
    pmaddubsw     m4, m13
    paddw         m0, m8
    paddw         m2, m3
    psllw         m8, m6, 7
    psllw         m3, m5, 7
    psubw         m8, m10
    psubw         m3, m10
    pmullw        m6, m12
    pmullw        m5, m12
    paddw         m0, m7
    paddw         m2, m4
    paddw         m0, m6
    paddw         m2, m5
    paddsw        m0, m8
    paddsw        m2, m3
    psraw         m0, 3
    psraw         m2, 3
    paddw         m0, m11
    paddw         m2, m11
    mova   [dstptrq], xm0
    mova [dstptrq+16], xm2
    vextracti128 [dstptrq+32], m0, 1
    vextracti128 [dstptrq+48], m2, 1
    vextracti128 xm0, m1, 1
    add      srcptrq, 32
    add      dstptrq, 64
    sub           xq, 32
    cmp           xd, 32
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    cmp           xd, xlimd
    jg .splat_right

    add         srcq, strideq
    add         dstq, 384*2
    dec           hd
    jg .loop
    RET
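
; wiener_filter_v: vertical 7-tap wiener filter over the int16 output of
; wiener_filter_h, writing final 8-bit pixels. A rough sketch, 8bpc:
;
;   for (int y = 0; y < h; y++)
;       for (int x = 0; x < w; x++) {
;           int sum = 0;
;           for (int k = 0; k < 7; k++)        // fv[k] == fv[6 - k]
;               sum += fv[k] * mid[(y + k) * 384 + x];
;           dst[y * stride + x] = clip_u8((sum + 1024) >> 11);
;       }
;
; pw_0_128 adds 128 to the center coefficient to undo the offset that the
; horizontal pass baked into the intermediates.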
cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
    vpbroadcastd m14, [fvq+4]
    vpbroadcastd m15, [fvq]
    vpbroadcastd m13, [pw_0_128]
    paddw        m14, m13
    vpbroadcastd m12, [pd_1024]

    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
    mov        ylimd, edged
    and        ylimd, 8 ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3

    ; main x loop for vertical filter, does one column of 16 pixels
.loop_x:
    mova          m3, [midq] ; middle line

    ; load top pixels
    test       edged, 4 ; have_top
    jz .emu_top
    mova          m0, [midq-384*4]
    mova          m2, [midq-384*2]
    mova          m1, m0
    jmp .load_bottom_pixels
.emu_top:
    mova          m0, m3
    mova          m1, m3
    mova          m2, m3

    ; load bottom pixels
.load_bottom_pixels:
    mov           yd, hd
    mov        mptrq, midq
    mov      dstptrq, dstq
    add           yd, ylimd
    jg .load_threelines

    ; the remainder here is somewhat messy but only runs in very weird
    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
    ; so performance is not terribly important here...
    je .load_twolines
    cmp           yd, -1
    je .load_oneline
    ; h == 1 case
    mova          m5, m3
    mova          m4, m3
    mova          m6, m3
    jmp .loop
.load_oneline:
    ; h == 2 case
    mova          m4, [midq+384*2]
    mova          m5, m4
    mova          m6, m4
    jmp .loop
.load_twolines:
    ; h == 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    mova          m6, m5
    jmp .loop
.load_threelines:
    ; h > 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    ; third line loaded in main loop below

    ; main y loop for vertical filter
.loop_load:
    ; load one line into m6. if that line is no longer available, do
    ; nothing, since m6 still has the data from the previous line in it; we
    ; try to structure the loop so that the common case is evaluated fastest
    mova          m6, [mptrq+384*6]
.loop:
    paddw         m7, m0, m6
    paddw         m8, m1, m5
    paddw         m9, m2, m4
    punpcklwd    m10, m7, m8
    punpckhwd     m7, m8
    punpcklwd    m11, m9, m3
    punpckhwd     m9, m3
    pmaddwd      m10, m15
    pmaddwd       m7, m15
    pmaddwd      m11, m14
    pmaddwd       m9, m14
    paddd        m10, m11
    paddd         m7, m9
    paddd        m10, m12
    paddd         m7, m12
    psrad        m10, 11
    psrad         m7, 11
    packssdw     m10, m7
    packuswb     m10, m10
    vpermq       m10, m10, q3120
    mova   [dstptrq], xm10
    ; shift pixels one position
    mova          m0, m1
    mova          m1, m2
    mova          m2, m3
    mova          m3, m4
    mova          m4, m5
    mova          m5, m6
    add      dstptrq, strideq
    add        mptrq, 384*2
    dec           yd
    jg .loop_load
    ; for the bottom pixels, continue using m6 (as extended edge)
    cmp           yd, ylimd
    jg .loop

    add         dstq, 16
    add         midq, 32
    sub           wd, 16
    jg .loop_x
    RET
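
; sgr_box3_h: horizontal part of the 3x3 box sums used by self-guided
; restoration. For every x it writes the sum and the sum of squares of a
; 3-wide window; sgr_box3_v then stacks three rows. Roughly:
;
;   sum[x]   = s0 + s1 + s2;                   // s_i = src[x - 1 + i]
;   sumsq[x] = s0 * s0 + s1 * s1 + s2 * s2;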
INIT_YMM avx2
cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    mov        xlimd, edged
    and        xlimd, 2                             ; have_right
    add           wd, xlimd
    xor        xlimd, 2                             ; 2*!have_right
    jnz .no_right
    add           wd, 15
    and           wd, ~15
.no_right:
    pxor          m1, m1
    lea         srcq, [srcq+wq]
    lea         sumq, [sumq+wq*2-2]
    lea       sumsqq, [sumsqq+wq*4-4]
    neg           wq
    lea          r10, [pb_right_ext_mask+32]
.loop_y:
    mov           xq, wq

    ; load left
    test       edged, 1                             ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    pinsrw       xm0, [leftq+2], 7
    add        leftq, 4
    jmp .expand_x
.no_left:
    vpbroadcastb xm0, [srcq+xq]
    jmp .expand_x
.load_left_from_main:
    pinsrw       xm0, [srcq+xq-2], 7
.expand_x:
    punpckhbw    xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp           xd, -16
    jle .loop_x
.partial_load_and_extend:
    vpbroadcastb  m3, [srcq-1]
    pmovzxbw      m2, [srcq+xq]
    punpcklbw     m3, m1
    movu          m4, [r10+xq*2]
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    psrldq       xm2, xm0, 14
    vpbroadcastw  m2, xm2
    jmp .loop_x_noload

.loop_x:
    pmovzxbw      m2, [srcq+xq]
.loop_x_noload:
    vinserti128   m0, xm2, 1
    palignr       m3, m2, m0, 12
    palignr       m4, m2, m0, 14

    punpcklwd     m5, m3, m2
    punpckhwd     m6, m3, m2
    paddw         m3, m4
    punpcklwd     m7, m4, m1
    punpckhwd     m4, m1
    pmaddwd       m5, m5
    pmaddwd       m6, m6
    pmaddwd       m7, m7
    pmaddwd       m4, m4
    paddd         m5, m7
    paddd         m6, m4
    paddw         m3, m2
    movu [sumq+xq*2], m3
    movu [sumsqq+xq*4+ 0], xm5
    movu [sumsqq+xq*4+16], xm6
    vextracti128 [sumsqq+xq*4+32], m5, 1
    vextracti128 [sumsqq+xq*4+48], m6, 1

    vextracti128 xm0, m2, 1
    add           xq, 16

    ; if x <= -16 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    add         srcq, strideq
    dec           hd
    jg .loop_y
    RET
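
; sgr_box3_v: vertical part; adds three vertically adjacent rows of the
; horizontal sums in place, completing the 3x3 boxes. Conceptually:
;
;   sum[y][x]   = hsum[y - 1][x]   + hsum[y][x]   + hsum[y + 1][x];
;   sumsq[y][x] = hsumsq[y - 1][x] + hsumsq[y][x] + hsumsq[y + 1][x];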
INIT_YMM avx2
cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    mov           xq, -2
    mov        ylimd, edged
    and        ylimd, 8                             ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
.loop_x:
    lea           yd, [hq+ylimq+2]
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test       edged, 4                             ; have_top
    jnz .load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    movu          m6, [sum_ptrq+(384+16)*2*1]
    mova          m7, m6
    mova          m8, m6
    jmp .loop_y_noload
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]     ; l2sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]  ; l2sq [right]
    movu          m2, [sumsq_ptrq-(384+16)*4*0]     ; l1sq [left]
    movu          m3, [sumsq_ptrq-(384+16)*4*0+32]  ; l1sq [right]
    movu          m6, [sum_ptrq-(384+16)*2*1]       ; l2
    movu          m7, [sum_ptrq-(384+16)*2*0]       ; l1
.loop_y:
    movu          m4, [sumsq_ptrq+(384+16)*4*1]     ; l0sq [left]
    movu          m5, [sumsq_ptrq+(384+16)*4*1+32]  ; l0sq [right]
    movu          m8, [sum_ptrq+(384+16)*2*1]       ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw         m6, m7
    paddd         m0, m4
    paddd         m1, m5
    paddw         m6, m8
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+32], m1
    movu  [sum_ptrq], m6

    ; shift position down by one
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    mova          m3, m5
    mova          m6, m7
    mova          m7, m8
    add   sumsq_ptrq, (384+16)*4
    add     sum_ptrq, (384+16)*2
    dec           yd
    jg .loop_y
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET
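
; sgr_calc_ab1: converts the 3x3 box sums (a = sumsq, b = sum) into the
; self-guided filter coefficients, n = 9. A rough sketch of the reference
; math (the asm folds the rounding and the min() into the 0xf00801c7
; constant and the table base in r5):
;
;   p  = max(a * 9 - b * b, 0);
;   z  = (p * s + (1 << 19)) >> 20;
;   xx = sgr_x_by_x[min(z, 255)];
;   a' = (xx * b * 455 + (1 << 11)) >> 12;     // 455 == one_by_x for n = 9
;   b' = 256 - xx;
;
; The vpgatherdd pair does the table lookup for 8 pixels at once; pcmpeqb
; m7 keeps the all-ones mask that each gather consumes.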
INIT_YMM avx2
cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
    lea           r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
    movd         xm6, sd
    vpbroadcastd  m6, xm6
%else
    vpbroadcastd  m6, sm
%endif
    vpbroadcastd  m8, [pd_0xf00801c7]
    vpbroadcastd  m9, [pw_256]
    pcmpeqb       m7, m7
    psrld        m10, m9, 13                        ; pd_2048
    DEFINE_ARGS a, b, w, h, x

.loop_y:
    mov           xq, -2
.loop_x:
    pmovzxwd      m0, [bq+xq*2]
    pmovzxwd      m1, [bq+xq*2+(384+16)*2]
    movu          m2, [aq+xq*4]
    movu          m3, [aq+xq*4+(384+16)*4]
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m2, m4                            ; aa * 9
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    pmaddwd       m0, m8
    pmaddwd       m1, m8
    psubd         m2, m4                            ; p = aa * 9 - bb * bb
    psubd         m3, m5
    pmulld        m2, m6
    pmulld        m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                            ; z
    psrld         m3, 20
    mova          m5, m7
    vpgatherdd    m4, [r5+m2], m5                   ; xx
    mova          m5, m7
    vpgatherdd    m2, [r5+m3], m5
    psrld         m4, 24
    psrld         m2, 24
    pmulld        m0, m4
    pmulld        m1, m2
    packssdw      m4, m2
    psubw         m4, m9, m4
    vpermq        m4, m4, q3120
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 12
    psrld         m1, 12
    movu   [bq+xq*2], xm4
    vextracti128 [bq+xq*2+(384+16)*2], m4, 1
    movu   [aq+xq*4], m0
    movu [aq+xq*4+(384+16)*4], m1
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET
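
; sgr_finish_filter1: applies the 3x3 neighbourhood weighting to the
; (a, b) pairs and produces the int16 intermediate t. The weights match
; the reference's 8-neighbour kernel (4 for centre and edge neighbours,
; 3 for diagonals, computed as 4 * sum(3x3) - sum(diagonals)). Roughly,
; per pixel:
;
;   A = 4 * (sum of b over centre + edge neighbours)
;     + 3 * (sum of b over diagonal neighbours);
;   B = the same weighting applied to the a plane;
;   t[x] = (A * src[x] + B + (1 << 8)) >> 9;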
INIT_YMM avx2
cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
    vpbroadcastd m15, [pw_16]
    xor           xd, xd
.loop_x:
    lea     tmp_ptrq, [tq+xq*2]
    lea     src_ptrq, [srcq+xq*1]
    lea       a_ptrq, [aq+xq*4+(384+16)*4]
    lea       b_ptrq, [bq+xq*2+(384+16)*2]
    movu          m0, [aq+xq*4-(384+16)*4-4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
    paddd         m0, m2                            ; a:tl+tr [first half]
    movu          m2, [aq+xq*4-(384+16)*4-4+32]
    movu          m4, [aq+xq*4-(384+16)*4+4+32]
    mova          m3, [aq+xq*4-(384+16)*4+32]       ; a:top [second half]
    paddd         m2, m4                            ; a:tl+tr [second half]
    movu          m4, [aq+xq*4-4]
    movu          m5, [aq+xq*4+4]
    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
    paddd         m4, m5                            ; a:l+r [first half]
    movu          m5, [aq+xq*4+32-4]
    movu          m6, [aq+xq*4+32+4]
    paddd         m3, [aq+xq*4+32]                  ; a:top+ctr [second half]
    paddd         m5, m6                            ; a:l+r [second half]

    movu          m6, [bq+xq*2-(384+16)*2-2]
    movu          m8, [bq+xq*2-(384+16)*2+2]
    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
    paddw         m6, m8                            ; b:tl+tr
    movu          m8, [bq+xq*2-2]
    movu          m9, [bq+xq*2+2]
    paddw         m7, [bq+xq*2]                     ; b:top+ctr
    paddw         m8, m9                            ; b:l+r
    mov           yd, hd
.loop_y:
    movu          m9, [b_ptrq-2]
    movu         m10, [b_ptrq+2]
    paddw         m7, [b_ptrq]                      ; b:top+ctr+bottom
    paddw         m9, m10                           ; b:bl+br
    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
    paddw         m6, m9                            ; b:tl+tr+bl+br
    psubw         m7, [b_ptrq-(384+16)*2*2]         ; b:ctr+bottom
    paddw        m10, m6
    psllw        m10, 2
    psubw        m10, m6                            ; aa
    pmovzxbw     m12, [src_ptrq]
    punpcklwd     m6, m10, m15
    punpckhwd    m10, m15
    punpcklwd    m13, m12, m15
    punpckhwd    m12, m15
    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]

    movu         m11, [a_ptrq-4]
    movu         m12, [a_ptrq+4]
    paddd         m1, [a_ptrq]                      ; a:top+ctr+bottom [first half]
    paddd        m11, m12                           ; a:bl+br [first half]
    movu         m12, [a_ptrq+32-4]
    movu         m13, [a_ptrq+32+4]
    paddd         m3, [a_ptrq+32]                   ; a:top+ctr+bottom [second half]
    paddd        m12, m13                           ; a:bl+br [second half]
    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
    paddd        m13, m0
    paddd        m14, m2
    pslld        m13, 2
    pslld        m14, 2
    psubd        m13, m0                            ; bb [first half]
    psubd        m14, m2                            ; bb [second half]
    vperm2i128    m0, m13, m14, 0x31
    vinserti128  m13, xm14, 1
    psubd         m1, [a_ptrq-(384+16)*4*2]         ; a:ctr+bottom [first half]
    psubd         m3, [a_ptrq-(384+16)*4*2+32]      ; a:ctr+bottom [second half]

    paddd         m6, m13
    paddd        m10, m0
    psrad         m6, 9
    psrad        m10, 9
    packssdw      m6, m10
    mova  [tmp_ptrq], m6

    ; shift to next row
    mova          m0, m4
    mova          m2, m5
    mova          m4, m11
    mova          m5, m12
    mova          m6, m8
    mova          m8, m9

    add       a_ptrq, (384+16)*4
    add       b_ptrq, (384+16)*2
    add     tmp_ptrq, 384*2
    add     src_ptrq, strideq
    dec           yd
    jg .loop_y
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET
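
; sgr_weighted1: blends the filtered plane t back into dst with a single
; weight. Roughly:
;
;   u      = dst[x] << 4;
;   dst[x] = clip_u8(dst[x] + ((wt * (t[x] - u) + (1 << 10)) >> 11));
;
; wt is pre-shifted left by 4 so that pmulhrsw's fixed (x * y * 2 + 32768)
; >> 16 yields exactly the + (1 << 10) >> 11.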
INIT_YMM avx2
cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt
    movd         xm0, wtd
    vpbroadcastw  m0, xm0
    psllw         m0, 4
    DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [tq+idxq*2+ 0]
    mova          m4, [tq+idxq*2+32]
    pmovzxbw      m2, [dstq+idxq+ 0]
    pmovzxbw      m5, [dstq+idxq+16]
    psllw         m3, m2, 4
    psllw         m6, m5, 4
    psubw         m1, m3
    psubw         m4, m6
    pmulhrsw      m1, m0
    pmulhrsw      m4, m0
    paddw         m1, m2
    paddw         m4, m5
    packuswb      m1, m4
    vpermq        m1, m1, q3120
    mova [dstq+idxq], m1
    add         idxd, 32
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add           tq, 384 * 2
    dec           hd
    jg .loop_y
    RET
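
; sgr_box5_h: as sgr_box3_h, but over a 5-wide window for the second
; self-guided pass:
;
;   sum[x]   = s0 + s1 + s2 + s3 + s4;         // s_i = src[x - 2 + i]
;   sumsq[x] = s0*s0 + s1*s1 + s2*s2 + s3*s3 + s4*s4;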
INIT_YMM avx2
cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    test       edged, 2                             ; have_right
    jz .no_right
    xor        xlimd, xlimd
    add           wd, 2
    add           wd, 15
    and           wd, ~15
    jmp .right_done
.no_right:
    mov        xlimd, 3
    sub           wd, 1
.right_done:
    pxor          m1, m1
    lea         srcq, [srcq+wq+1]
    lea         sumq, [sumq+wq*2-2]
    lea       sumsqq, [sumsqq+wq*4-4]
    neg           wq
    lea          r10, [pb_right_ext_mask+32]
.loop_y:
    mov           xq, wq

    ; load left
    test       edged, 1                             ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    movd         xm0, [leftq]
    pinsrd       xm0, [srcq+xq-1], 1
    pslldq       xm0, 11
    add        leftq, 4
    jmp .expand_x
.no_left:
    vpbroadcastb xm0, [srcq+xq-1]
    jmp .expand_x
.load_left_from_main:
    pinsrd       xm0, [srcq+xq-4], 3
.expand_x:
    punpckhbw    xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jge .right_extend
.partial_load_and_extend:
    vpbroadcastb  m3, [srcq-1]
    pmovzxbw      m2, [srcq+xq]
    punpcklbw     m3, m1
    movu          m4, [r10+xq*2]
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    psrldq       xm2, xm0, 14
    vpbroadcastw  m2, xm2
    jmp .loop_x_noload

.loop_x:
    pmovzxbw      m2, [srcq+xq]
.loop_x_noload:
    vinserti128   m0, xm2, 1
    palignr       m3, m2, m0, 8
    palignr       m4, m2, m0, 10
    palignr       m5, m2, m0, 12
    palignr       m6, m2, m0, 14

    paddw         m0, m3, m2
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    paddw         m0, m4
    punpcklwd     m8, m4, m5
    punpckhwd     m4, m5
    paddw         m0, m5
    punpcklwd     m9, m6, m1
    punpckhwd     m5, m6, m1
    paddw         m0, m6
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m8, m8
    pmaddwd       m4, m4
    pmaddwd       m9, m9
    pmaddwd       m5, m5
    paddd         m7, m8
    paddd         m3, m4
    paddd         m7, m9
    paddd         m3, m5
    movu [sumq+xq*2], m0
    movu [sumsqq+xq*4+ 0], xm7
    movu [sumsqq+xq*4+16], xm3
    vextracti128 [sumsqq+xq*4+32], m7, 1
    vextracti128 [sumsqq+xq*4+48], m3, 1

    vextracti128 xm0, m2, 1
    add           xq, 16

    ; if x <= -16 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    add         srcq, strideq
    dec           hd
    jg .loop_y
    RET
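
; sgr_box5_v: stacks five rows of the horizontal sums to complete the 5x5
; boxes. The 5x5 pass only needs every second row, so each iteration of
; the y loop consumes two input rows and emits one output row:
;
;   out[y][x] = hsum[y-2][x] + hsum[y-1][x] + hsum[y][x]
;             + hsum[y+1][x] + hsum[y+2][x];  // y += 2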
INIT_YMM avx2
cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    mov           xq, -2
    mov        ylimd, edged
    and        ylimd, 8                             ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
.loop_x:
    lea           yd, [hq+ylimq+2]
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test       edged, 4                             ; have_top
    jnz .load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    movu         m10, [sum_ptrq+(384+16)*2*1]
    mova         m11, m10
    mova         m12, m10
    mova         m13, m10
    jmp .loop_y_second_load
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]     ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]  ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]     ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+32]  ; l2sq [right]
    mova          m2, m0
    mova          m3, m1
    movu         m10, [sum_ptrq-(384+16)*2*1]       ; l3/4
    movu         m12, [sum_ptrq-(384+16)*2*0]       ; l2
    mova         m11, m10
.loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]     ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+32]  ; l1sq [right]
    movu         m13, [sum_ptrq+(384+16)*2*1]       ; l1
.loop_y_second_load:
    test          yd, yd
    jle .emulate_second_load
    movu          m8, [sumsq_ptrq+(384+16)*4*2]     ; l0sq [left]
    movu          m9, [sumsq_ptrq+(384+16)*4*2+32]  ; l0sq [right]
    movu         m14, [sum_ptrq+(384+16)*2*2]       ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw        m10, m11
    paddd         m0, m4
    paddd         m1, m5
    paddw        m10, m12
    paddd         m0, m6
    paddd         m1, m7
    paddw        m10, m13
    paddd         m0, m8
    paddd         m1, m9
    paddw        m10, m14
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+32], m1
    movu  [sum_ptrq], m10

    ; shift position down by one
    mova          m0, m4
    mova          m1, m5
    mova          m2, m6
    mova          m3, m7
    mova          m4, m8
    mova          m5, m9
    mova         m10, m12
    mova         m11, m13
    mova         m12, m14
    add   sumsq_ptrq, (384+16)*4*2
    add     sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .loop_y
    ; l1 = l0
    mova          m6, m8
    mova          m7, m9
    mova         m13, m14
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET
.emulate_second_load:
    mova          m8, m6
    mova          m9, m7
    mova         m14, m13
    jmp .loop_y_noload
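
; sgr_calc_ab2: as sgr_calc_ab1, but for the 5x5 pass: n = 25, so
; p = a * 25 - b * b, and the output scale is 164 == one_by_x for n = 25
; (applied as * 41 with a 2-bit-smaller shift, since 164 == 41 * 4).
; Only every second row is processed, matching sgr_box5_v's output.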
INIT_YMM avx2
cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
    lea           r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
    movd         xm6, sd
    vpbroadcastd  m6, xm6
%else
    vpbroadcastd  m6, sm
%endif
    vpbroadcastd  m8, [pd_0xf0080029]
    vpbroadcastd  m9, [pw_256]
    pcmpeqb       m7, m7
    psrld        m10, m9, 15                        ; pd_512
    DEFINE_ARGS a, b, w, h, x
.loop_y:
    mov           xq, -2
.loop_x:
    pmovzxwd      m0, [bq+xq*2+ 0]
    pmovzxwd      m1, [bq+xq*2+16]
    movu          m2, [aq+xq*4+ 0]
    movu          m3, [aq+xq*4+32]
    pslld         m4, m2, 3                         ; aa * 8
    pslld         m5, m3, 3
    paddd         m2, m4                            ; aa * 9
    paddd         m3, m5
    paddd         m4, m4                            ; aa * 16
    paddd         m5, m5
    paddd         m2, m4                            ; aa * 25
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    psubd         m2, m4                            ; p = aa * 25 - bb * bb
    psubd         m3, m5
    pmulld        m2, m6
    pmulld        m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                            ; z
    psrld         m3, 20
    mova          m5, m7
    vpgatherdd    m4, [r5+m2], m5                   ; xx
    mova          m5, m7
    vpgatherdd    m2, [r5+m3], m5
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pmullw        m4, m8
    pmullw        m2, m8
    psubw         m3, m9, m3
    vpermq        m3, m3, q3120
    pmaddwd       m0, m4
    pmaddwd       m1, m2
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 10
    psrld         m1, 10
    movu   [bq+xq*2], m3
    movu [aq+xq*4+ 0], m0
    movu [aq+xq*4+32], m1
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET
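
; sgr_finish_filter2: neighbourhood weighting for the 5x5 pass. (a, b)
; exist only on every second row, so each iteration emits two rows of t:
; one row lies between two (a, b) rows and mixes them with 6:5
; centre:diagonal weights (pw_5_6), the other reuses a single (a, b) row
; at half the total weight. Roughly:
;
;   // row between two (a, b) rows:
;   A = 6 * (b[t] + b[b]) + 5 * (b[tl] + b[tr] + b[bl] + b[br]);
;   t0[x] = (A * src0[x] + B + (1 << 8)) >> 9;  // B: same weights on a
;   // row with its own (a, b):
;   A = 6 * b[c] + 5 * (b[l] + b[r]);
;   t1[x] = (A * src1[x] + B + (1 << 7)) >> 8;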
INIT_YMM avx2
cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
    vpbroadcastd  m9, [pw_5_6]
    vpbroadcastd m12, [pw_256]
    psrlw        m11, m12, 1                    ; pw_128
    psrlw        m10, m12, 8                    ; pw_1
    xor           xd, xd
.loop_x:
    lea     tmp_ptrq, [tq+xq*2]
    lea     src_ptrq, [srcq+xq*1]
    lea       a_ptrq, [aq+xq*4+(384+16)*4]
    lea       b_ptrq, [bq+xq*2+(384+16)*2]
    movu          m0, [aq+xq*4-(384+16)*4-4]
    mova          m1, [aq+xq*4-(384+16)*4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    movu          m3, [aq+xq*4-(384+16)*4-4+32]
    mova          m4, [aq+xq*4-(384+16)*4+32]
    movu          m5, [aq+xq*4-(384+16)*4+4+32]
    paddd         m0, m2
    paddd         m3, m5
    paddd         m0, m1
    paddd         m3, m4
    pslld         m2, m0, 2
    pslld         m5, m3, 2
    paddd         m2, m0
    paddd         m5, m3
    paddd         m0, m2, m1                    ; prev_odd_b [first half]
    paddd         m1, m5, m4                    ; prev_odd_b [second half]
    movu          m3, [bq+xq*2-(384+16)*2-2]
    mova          m4, [bq+xq*2-(384+16)*2]
    movu          m5, [bq+xq*2-(384+16)*2+2]
    paddw         m3, m5
    punpcklwd     m5, m3, m4
    punpckhwd     m3, m4
    pmaddwd       m5, m9
    pmaddwd       m3, m9
    packssdw      m2, m5, m3                    ; prev_odd_a
    mov           yd, hd
.loop_y:
    movu          m3, [a_ptrq-4]
    mova          m4, [a_ptrq]
    movu          m5, [a_ptrq+4]
    movu          m6, [a_ptrq+32-4]
    mova          m7, [a_ptrq+32]
    movu          m8, [a_ptrq+32+4]
    paddd         m3, m5
    paddd         m6, m8
    paddd         m3, m4
    paddd         m6, m7
    pslld         m5, m3, 2
    pslld         m8, m6, 2
    paddd         m5, m3
    paddd         m8, m6
    paddd         m3, m5, m4                    ; cur_odd_b [first half]
    paddd         m4, m8, m7                    ; cur_odd_b [second half]
    movu          m5, [b_ptrq-2]
    mova          m6, [b_ptrq]
    movu          m7, [b_ptrq+2]
    paddw         m5, m7
    punpcklwd     m7, m5, m6
    punpckhwd     m5, m6
    pmaddwd       m7, m9
    pmaddwd       m5, m9
    packssdw      m5, m7, m5                    ; cur_odd_a

    paddd         m0, m3                        ; cur_even_b [first half]
    paddd         m1, m4                        ; cur_even_b [second half]
    paddw         m2, m5                        ; cur_even_a

    pmovzxbw      m6, [src_ptrq]
    vperm2i128    m8, m0, m1, 0x31
    vinserti128   m0, xm1, 1
    punpcklwd     m7, m6, m10
    punpckhwd     m6, m10
    punpcklwd     m1, m2, m12
    punpckhwd     m2, m12
    pmaddwd       m7, m1
    pmaddwd       m6, m2
    paddd         m7, m0
    paddd         m6, m8
    psrad         m7, 9
    psrad         m6, 9

    pmovzxbw      m8, [src_ptrq+strideq]
    punpcklwd     m0, m8, m10
    punpckhwd     m8, m10
    punpcklwd     m1, m5, m11
    punpckhwd     m2, m5, m11
    pmaddwd       m0, m1
    pmaddwd       m8, m2
    vinserti128   m2, m3, xm4, 1
    vperm2i128    m1, m3, m4, 0x31
    paddd         m0, m2
    paddd         m8, m1
    psrad         m0, 8
    psrad         m8, 8

    packssdw      m7, m6
    packssdw      m0, m8
    mova [tmp_ptrq+384*2*0], m7
    mova [tmp_ptrq+384*2*1], m0

    mova          m0, m3
    mova          m1, m4
    mova          m2, m5
    add       a_ptrq, (384+16)*4*2
    add       b_ptrq, (384+16)*2*2
    add     tmp_ptrq, 384*2*2
    lea     src_ptrq, [src_ptrq+strideq*2]
    sub           yd, 2
    jg .loop_y
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET
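
; sgr_weighted2: blends the two filtered planes t1 and t2 into dst with
; the weight pair wt[0..1]. Roughly:
;
;   u      = dst[x] << 4;
;   v      = wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u);
;   dst[x] = clip_u8(dst[x] + ((v + (1 << 10)) >> 11));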
INIT_YMM avx2
cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt
    vpbroadcastd  m0, [wtq]
    vpbroadcastd m10, [pd_1024]
    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [t1q+idxq*2+ 0]
    mova          m2, [t1q+idxq*2+32]
    mova          m3, [t2q+idxq*2+ 0]
    mova          m4, [t2q+idxq*2+32]
    pmovzxbw      m5, [dstq+idxq+ 0]
    pmovzxbw      m6, [dstq+idxq+16]
    psllw         m7, m5, 4
    psllw         m8, m6, 4
    psubw         m1, m7
    psubw         m2, m8
    psubw         m3, m7
    psubw         m4, m8
    punpcklwd     m9, m1, m3
    punpckhwd     m1, m3
    punpcklwd     m3, m2, m4
    punpckhwd     m2, m4
    pmaddwd       m9, m0
    pmaddwd       m1, m0
    pmaddwd       m3, m0
    pmaddwd       m2, m0
    paddd         m9, m10
    paddd         m1, m10
    paddd         m3, m10
    paddd         m2, m10
    psrad         m9, 11
    psrad         m1, 11
    psrad         m3, 11
    psrad         m2, 11
    packssdw      m1, m9, m1
    packssdw      m2, m3, m2
    paddw         m1, m5
    paddw         m2, m6
    packuswb      m1, m2
    vpermq        m1, m1, q3120
    mova [dstq+idxq], m1
    add         idxd, 32
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add          t1q, 384 * 2
    add          t2q, 384 * 2
    dec           hd
    jg .loop_y
    RET
%endif ; ARCH_X86_64