1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10;
11
12;
13
14
15%include "x86_abi_support.asm"
16
17;Note: tap3 and tap4 have to be applied and added after other taps to avoid
18;overflow.
19
; GET_FILTERS_4
; Reads the eight 16-bit filter taps addressed by arg(5) and stores them,
; two taps per 128-bit slot, into k0k1, k2k3, k5k4 and k6k7.  In each slot
; the first-named tap fills the low four words and the second-named tap
; fills the high four words.  Also writes krd (the value 64 in every word)
; and zero (all bits clear).
; The slot names are expected to be %define'd by the expanding function.
; The taps are fetched with movdqa, so the filter must be 16-byte aligned.
; Clobbers: rcx, rdx, xmm0-xmm7.
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx -> filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    pshuflw     xmm0, xmm7, 00000000b       ;low words = k0
    pshuflw     xmm1, xmm7, 01010101b       ;low words = k1
    punpcklqdq  xmm0, xmm1                  ;k0 (low) | k1 (high)
    movdqa      k0k1, xmm0

    pshuflw     xmm2, xmm7, 10101010b       ;low words = k2
    pshuflw     xmm3, xmm7, 11111111b       ;low words = k3
    punpcklqdq  xmm2, xmm3                  ;k2 (low) | k3 (high)
    movdqa      k2k3, xmm2

    psrldq      xmm7, 8                     ;bring k4 .. k7 into the low half
    pshuflw     xmm4, xmm7, 00000000b       ;low words = k4
    pshuflw     xmm5, xmm7, 01010101b       ;low words = k5
    punpcklqdq  xmm5, xmm4                  ;k5 (low) | k4 (high)
    movdqa      k5k4, xmm5

    pshuflw     xmm6, xmm7, 10101010b       ;low words = k6
    pshuflw     xmm7, xmm7, 11111111b       ;low words = k7
    punpcklqdq  xmm6, xmm7                  ;k6 (low) | k7 (high)
    movdqa      k6k7, xmm6

    mov         rcx, 0x00400040             ;two words of 64
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate to all eight words
    movdqa      krd, xmm6                   ;rounding term

    pxor        xmm7, xmm7
    movdqa      zero, xmm7                  ;all-zero constant for unpacking
%endm
52
; APPLY_FILTER_4 avg
; In:  the low dword of xmm0 .. xmm7 holds the four source bytes that belong
;      to taps 0 .. 7 respectively; rdi -> destination.
; Out: four filtered bytes written to [rdi].  When %1 is non-zero the result
;      is first averaged (pavgb) with the four bytes already at [rdi].
; Each output is ((saturating sum of tap products) + 64) >> 7, packed to a
; byte with unsigned saturation.
; The products are accumulated in the order k0, k6, k1, k7, k2, k5, k3, k4;
; this order is kept deliberately (see the overflow note near the top of
; this file).
; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
%macro APPLY_FILTER_4 1
    ; taps 0/1: merge the two rows, widen to words, multiply
    punpckldq   xmm0, xmm1
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0k1

    ; taps 6/7
    punpckldq   xmm6, xmm7
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7

    ; taps 2/3
    punpckldq   xmm2, xmm3
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3

    ; taps 5/4
    punpckldq   xmm5, xmm4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4

    ; accumulate into the low four words of xmm0
    paddsw      xmm0, xmm6                  ;k0+k6 (low), k1+k7 (high)
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1                  ;fold the high half onto the low half
    paddsw      xmm0, xmm2                  ;+ k2
    psrldq      xmm2, 8
    paddsw      xmm0, xmm5                  ;+ k5
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2                  ;+ k3
    paddsw      xmm0, xmm5                  ;+ k4

    paddsw      xmm0, krd                   ;add 64
    psraw       xmm0, 7                     ;divide by 128
    packuswb    xmm0, xmm0                  ;words -> unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0                 ;store 4 pixels
%endm
90
; GET_FILTERS
; Loads rsi with arg(0) (source pointer) and rdi with arg(2) (destination
; pointer), then reads the eight 16-bit taps addressed by arg(5) and stores
; each one, replicated into all eight words, in the stack slots k0 .. k7.
; Also writes krd (the value 64 in every word) and zero (all bits clear).
; The slot names are expected to be %define'd by the expanding function.
; The taps are fetched with movdqa, so the filter must be 16-byte aligned.
; Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;rdx -> filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    ; taps 0-3 live in the low half: spread within the low words, then double up
    pshuflw     xmm0, xmm7, 00000000b
    punpcklwd   xmm0, xmm0
    movdqa      k0,   xmm0

    pshuflw     xmm1, xmm7, 01010101b
    punpcklwd   xmm1, xmm1
    movdqa      k1,   xmm1

    pshuflw     xmm2, xmm7, 10101010b
    punpcklwd   xmm2, xmm2
    movdqa      k2,   xmm2

    pshuflw     xmm3, xmm7, 11111111b
    punpcklwd   xmm3, xmm3
    movdqa      k3,   xmm3

    ; taps 4-7 live in the high half: same idea on the high words
    pshufhw     xmm4, xmm7, 00000000b
    punpckhwd   xmm4, xmm4
    movdqa      k4,   xmm4

    pshufhw     xmm5, xmm7, 01010101b
    punpckhwd   xmm5, xmm5
    movdqa      k5,   xmm5

    pshufhw     xmm6, xmm7, 10101010b
    punpckhwd   xmm6, xmm6
    movdqa      k6,   xmm6

    pshufhw     xmm7, xmm7, 11111111b       ;last: overwrites the tap source
    punpckhwd   xmm7, xmm7
    movdqa      k7,   xmm7

    mov         rcx, 0x00400040             ;two words of 64
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate to all eight words
    movdqa      krd, xmm6                   ;rounding term

    pxor        xmm7, xmm7
    movdqa      zero, xmm7                  ;all-zero constant for unpacking
%endm
132
; LOAD_VERT_8 col
; In:  rsi -> row 0 of the source window, rax = source pitch,
;      rdx = 3 * source pitch, %1 = byte offset of the column group.
; Out: xmmN (N = 0 .. 7) holds the 8 bytes at row N, column offset %1.
; Side effect: rsi is advanced by one row (rsi += rax); the callers use
; this as their per-iteration source step.
%macro LOAD_VERT_8 1
    ; rows addressed from the incoming rsi
    movq        xmm0, [rsi + %1]            ;row 0
    movq        xmm1, [rsi + rax + %1]      ;row 1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;row 6 = 2 * (3 * pitch)

    lea         rsi,  [rsi + rax]           ;step down one row

    ; rows addressed from the advanced rsi (one row further than written)
    movq        xmm2, [rsi + rax + %1]      ;row 2
    movq        xmm3, [rsi + rax * 2 + %1]  ;row 3
    movq        xmm4, [rsi + rdx + %1]      ;row 4
    movq        xmm5, [rsi + rax * 4 + %1]  ;row 5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm
144
; APPLY_FILTER_8 avg, offset
; In:  the low 8 bytes of xmm0 .. xmm7 hold the source bytes that belong to
;      taps 0 .. 7 respectively; rdi -> destination row.
; Out: eight filtered bytes written to [rdi + %2].  When %1 is non-zero the
;      result is first averaged (pavgb) with the bytes already stored there.
; Each output is ((saturating sum of tap products) + 64) >> 7, packed to a
; byte with unsigned saturation.
; The products are accumulated in the order k0, k1, k6, k7, k2, k5, k3, k4;
; this order is kept deliberately (see the overflow note near the top of
; this file).
; Clobbers: xmm0-xmm7.
%macro APPLY_FILTER_8 2
    ; widen every tap input to words and scale it by its coefficient
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0
    punpcklbw   xmm1, zero
    pmullw      xmm1, k1
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2
    punpcklbw   xmm3, zero
    pmullw      xmm3, k3
    punpcklbw   xmm4, zero
    pmullw      xmm4, k4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6
    punpcklbw   xmm7, zero
    pmullw      xmm7, k7

    ; saturating accumulation: outer taps first, taps 3 and 4 last
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;add 64
    psraw       xmm0, 7                     ;divide by 128
    packuswb    xmm0, xmm0                  ;words -> unsigned bytes
%if %1
    movq        xmm1, [rdi + %2]            ;current destination pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0            ;store 8 pixels
%endm
181
182SECTION .text
183
;void svt_aom_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
; Vertical 8-tap filter, 4 pixels wide.
; For each of output_height rows, 4 bytes are read from 8 consecutive source
; rows (the first one at src_ptr, i.e. src_ptr is the row paired with tap 0),
; weighted by filter[0..7], rounded (+64, >>7), saturated to bytes and
; written to output_ptr.  The source then advances by src_pitch and the
; destination by out_pitch.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row 0      rax = src_pitch      rdx = 3 * src_pitch
;   rdi = destination row   rbx = out_pitch      rcx = rows remaining
global sym(svt_aom_filter_block1d4_v8_sse2) PRIVATE
sym(svt_aom_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one source row
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]            ;next destination row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
250
;void svt_aom_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
; Vertical 8-tap filter, 8 pixels wide.
; For each of output_height rows, 8 bytes are read from 8 consecutive source
; rows (the first one at src_ptr, i.e. src_ptr is the row paired with tap 0),
; weighted by filter[0..7], rounded (+64, >>7), saturated to bytes and
; written to output_ptr.  The source then advances by src_pitch (done inside
; LOAD_VERT_8) and the destination by out_pitch.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row 0      rax = src_pitch      rdx = 3 * src_pitch
;   rdi = destination row   rbx = out_pitch      rcx = rows remaining
global sym(svt_aom_filter_block1d8_v8_sse2) PRIVATE
sym(svt_aom_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    LOAD_VERT_8 0                           ;rsi += src_pitch as a side effect
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]            ;next destination row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
309
;void svt_aom_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;
; Vertical 8-tap filter, 16 pixels wide, processed as two 8-pixel halves.
; For each of output_height rows, 16 bytes are read from 8 consecutive source
; rows (the first one at src_ptr, i.e. src_ptr is the row paired with tap 0),
; weighted by filter[0..7], rounded (+64, >>7), saturated to bytes and
; written to output_ptr.  The source then advances by src_pitch and the
; destination by out_pitch.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row 0      rax = src_pitch      rdx = 3 * src_pitch
;   rdi = destination row   rbx = out_pitch      rcx = rows remaining
global sym(svt_aom_filter_block1d16_v8_sse2) PRIVATE
sym(svt_aom_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    LOAD_VERT_8 0                           ;left half; rsi += src_pitch
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the step for the right half

    LOAD_VERT_8 8                           ;right half; rsi += src_pitch
    APPLY_FILTER_8 0, 8
    add         rdi, rbx                    ;next destination row

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
372
;void svt_aom_filter_block1d4_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
; Horizontal 8-tap filter, 4 pixels wide.
; For each of output_height rows, output pixel i is built from the source
; bytes src_ptr[i - 3 .. i + 4] weighted by filter[0..7], rounded
; (+64, >>7), saturated to a byte and written to output_ptr[i].  The source
; then advances by src_pixels_per_line and the destination by output_pitch.
; Each row is fetched with one unaligned 16-byte load starting at
; src_ptr - 3, so that whole range must be readable.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch        rcx = rows remaining
global sym(svt_aom_filter_block1d4_h8_sse2) PRIVATE
sym(svt_aom_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte constant slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0,   [rsi - 3]           ;load src

    ; xmmN = source shifted so that its low bytes line up with tap N
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
446
;void svt_aom_filter_block1d8_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
; Horizontal 8-tap filter, 8 pixels wide.
; For each of output_height rows, output pixel i is built from the source
; bytes src_ptr[i - 3 .. i + 4] weighted by filter[0..7], rounded
; (+64, >>7), saturated to a byte and written to output_ptr[i].  The source
; then advances by src_pixels_per_line and the destination by output_pitch.
; Each row is fetched with one unaligned 16-byte load starting at
; src_ptr - 3, so that whole range must be readable.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch        rcx = rows remaining
global sym(svt_aom_filter_block1d8_h8_sse2) PRIVATE
sym(svt_aom_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0,   [rsi - 3]           ;load src

    ; xmmN = source shifted so that its low bytes line up with tap N
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
521
;void svt_aom_filter_block1d16_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;
; Horizontal 8-tap filter, 16 pixels wide, processed as two 8-pixel halves.
; For each of output_height rows, output pixel i is built from the source
; bytes src_ptr[i - 3 .. i + 4] weighted by filter[0..7], rounded
; (+64, >>7), saturated to a byte and written to output_ptr[i].  The source
; then advances by src_pixels_per_line and the destination by output_pitch.
; Each row is fetched with two unaligned 16-byte loads, at src_ptr - 3 and
; src_ptr + 5, so src_ptr - 3 .. src_ptr + 20 must be readable.
; filter is read with movdqa and must be 16-byte aligned.
; Returns immediately, writing nothing, when output_height is 0.
;
; Register use inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch        rcx = rows remaining
global sym(svt_aom_filter_block1d16_h8_sse2) PRIVATE
sym(svt_aom_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte constant slots
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop below is bottom-tested; without this check a height of 0
    ; would wrap the counter and run off the end of both buffers.
    test        rcx, rcx
    jz          .done

.loop:
    ; ---- left 8 pixels ----
    movdqu      xmm0,   [rsi - 3]           ;load src

    ; xmmN = source shifted so that its low bytes line up with tap N
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    ; ---- right 8 pixels: same window, 8 bytes further along ----
    movdqu      xmm0,   [rsi + 5]           ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
616