1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14
15%include "aom_ports/x86_abi_support.asm"
16
17;Note: tap3 and tap4 have to be applied and added after other taps to avoid
18;overflow.
19
; HIGH_GET_FILTERS_4: prepare the constants consumed by HIGH_APPLY_FILTER_4.
;   Reads:    arg(5) = pointer to eight 16-bit filter taps (fetched with
;             movdqa, so the table has to be 16-byte aligned),
;             arg(6) = bit depth (bps), read as a 32-bit value.
;   Writes:   k0k6, k2k5, k3k4, k1k7 = tap pairs interleaved word by word,
;             krd = rounding constant in every dword,
;             max = (1 << bps) - 1 in every word, min = 0.
;             These names must already be %define'd by the invoking code as
;             16-byte aligned memory operands (they are written with movdqa).
;   Clobbers: rcx, rdx, xmm0-xmm7.
20%macro HIGH_GET_FILTERS_4 0
21    mov         rdx, arg(5)                 ;filter ptr
22    mov         rcx, 0x00000040             ;64 = half of (1 << 7), see psrad by 7 in HIGH_APPLY_FILTER_4
23
24    movdqa      xmm7, [rdx]                 ;load all eight taps
    ;pshuflw only permutes the low four words, so each register below ends
    ;up with one tap replicated across its low 64 bits.
25    pshuflw     xmm0, xmm7, 0b              ;k0
26    pshuflw     xmm1, xmm7, 01010101b       ;k1
27    pshuflw     xmm2, xmm7, 10101010b       ;k2
28    pshuflw     xmm3, xmm7, 11111111b       ;k3
29    psrldq      xmm7, 8                     ;bring taps k4..k7 down into the low qword
30    pshuflw     xmm4, xmm7, 0b              ;k4
31    pshuflw     xmm5, xmm7, 01010101b       ;k5
32    pshuflw     xmm6, xmm7, 10101010b       ;k6
33    pshuflw     xmm7, xmm7, 11111111b       ;k7
34
    ;Interleave the low words of two taps: the result is {a,b,a,b,a,b,a,b},
    ;the layout pmaddwd needs to form a*x + b*y per dword.
35    punpcklwd   xmm0, xmm6                  ;k0,k6 pairs
36    punpcklwd   xmm2, xmm5                  ;k2,k5 pairs
37    punpcklwd   xmm3, xmm4                  ;k3,k4 pairs
38    punpcklwd   xmm1, xmm7                  ;k1,k7 pairs
39
40    movdqa      k0k6, xmm0
41    movdqa      k2k5, xmm2
42    movdqa      k3k4, xmm3
43    movdqa      k1k7, xmm1
44
45    movq        xmm6, rcx                   ;rounding value in the low dword
46    pshufd      xmm6, xmm6, 0               ;replicate it to all four dwords
47    movdqa      krd, xmm6
48
49    ;Compute max and min values of a pixel
50    mov         rdx, 0x00010001             ;two 16-bit words, each equal to 1
51    movsxd      rcx, DWORD PTR arg(6)      ;bps
52    movq        xmm0, rdx
53    movq        xmm1, rcx                   ;shift count for psllw
54    pshufd      xmm0, xmm0, 0b              ;every word = 1
55    movdqa      xmm2, xmm0                  ;keep a copy of the ones
56    psllw       xmm0, xmm1                  ;every word = 1 << bps
57    psubw       xmm0, xmm2                  ;every word = (1 << bps) - 1
58    pxor        xmm1, xmm1                  ;all zero
59    movdqa      max, xmm0                  ;max value (for clamping)
60    movdqa      min, xmm1                  ;min value (for clamping)
61
62%endm
63
; HIGH_APPLY_FILTER_4 avg: filter four 16-bit samples and store them at [rdi].
;   %1 (avg): when non-zero the result is averaged (pavgw) with the four
;             words already present at [rdi] before being stored.
;   Inputs:   the low four words of xmm0..xmm7 hold the samples that are
;             multiplied by taps k0..k7 respectively; k0k6, k1k7, k2k5, k3k4,
;             krd, max and min as prepared by HIGH_GET_FILTERS_4; rdi = dest.
;   Result:   clamp((sum(tap * sample) + krd) >> 7, min, max), 8 bytes stored.
;   Clobbers: xmm0-xmm3.
64%macro HIGH_APPLY_FILTER_4 1
    ;Pair up samples so one pmaddwd produces tapA*a + tapB*b per dword.
65    punpcklwd   xmm0, xmm6                  ;samples for taps 0 and 6, interleaved
66    punpcklwd   xmm1, xmm7                  ;taps 1 and 7
67    punpcklwd   xmm2, xmm5                  ;taps 2 and 5
68    punpcklwd   xmm3, xmm4                  ;taps 3 and 4
69
70    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
71    pmaddwd     xmm1, k1k7
72    pmaddwd     xmm2, k2k5
73    pmaddwd     xmm3, k3k4
74
75    paddd       xmm0, xmm1                  ;sum of all eight products, 32 bits per sample
76    paddd       xmm0, xmm2
77    paddd       xmm0, xmm3
78
79    paddd       xmm0, krd                   ;rounding
80    psrad       xmm0, 7                     ;arithmetic shift right by 7
81    packssdw    xmm0, xmm0                  ;pack to word (signed saturation); both halves identical
82
83    ;clamp the values
84    pminsw      xmm0, max
85    pmaxsw      xmm0, min
86
87%if %1
88    movq        xmm1, [rdi]                 ;current destination samples
89    pavgw       xmm0, xmm1                  ;rounded average with them
90%endif
91    movq        [rdi], xmm0                 ;store only the low four words
92%endm
93
; HIGH_GET_FILTERS: prepare pointers and constants for HIGH_APPLY_FILTER_8.
;   Reads:    arg(0) = source pointer, arg(2) = destination pointer,
;             arg(5) = pointer to eight 16-bit taps (movdqa => 16-byte
;             aligned), arg(6) = bit depth (bps), read as a 32-bit value.
;   Writes:   rsi = arg(0), rdi = arg(2) (unlike HIGH_GET_FILTERS_4, this
;             macro loads the pointers itself);
;             k0k1, k6k7, k2k5, k3k4 = tap pairs interleaved word by word,
;             krd = rounding constant in every dword,
;             max = (1 << bps) - 1 in every word, min = 0.
;             These names must already be %define'd by the invoking code as
;             16-byte aligned memory operands (they are written with movdqa).
;   Clobbers: rcx, rdx, xmm0-xmm7.
94%macro HIGH_GET_FILTERS 0
95    mov         rdx, arg(5)                 ;filter ptr
96    mov         rsi, arg(0)                 ;src_ptr
97    mov         rdi, arg(2)                 ;output_ptr
98    mov         rcx, 0x00000040             ;64 = half of (1 << 7), see psrad by 7 in HIGH_APPLY_FILTER_8
99
100    movdqa      xmm7, [rdx]                 ;load filters
    ;pshuflw replicates a tap over the low four words, pshufhw over the high
    ;four words; the other half of each destination is copied from xmm7.
101    pshuflw     xmm0, xmm7, 0b              ;k0
102    pshuflw     xmm1, xmm7, 01010101b       ;k1
103    pshuflw     xmm2, xmm7, 10101010b       ;k2
104    pshuflw     xmm3, xmm7, 11111111b       ;k3
105    pshufhw     xmm4, xmm7, 0b              ;k4
106    pshufhw     xmm5, xmm7, 01010101b       ;k5
107    pshufhw     xmm6, xmm7, 10101010b       ;k6
108    pshufhw     xmm7, xmm7, 11111111b       ;k7
109    punpcklqdq  xmm2, xmm2                  ;k2 in all eight words (needed in the high half below)
110    punpcklqdq  xmm3, xmm3                  ;k3 in all eight words
111    punpcklwd   xmm0, xmm1                  ;low halves:  k0,k1 pairs
112    punpckhwd   xmm6, xmm7                  ;high halves: k6,k7 pairs
113    punpckhwd   xmm2, xmm5                  ;high halves: k2,k5 pairs
114    punpckhwd   xmm3, xmm4                  ;high halves: k3,k4 pairs
115
116    movdqa      k0k1, xmm0                  ;store filter factors on stack
117    movdqa      k6k7, xmm6
118    movdqa      k2k5, xmm2
119    movdqa      k3k4, xmm3
120
121    movq        xmm6, rcx                   ;rounding value in the low dword
122    pshufd      xmm6, xmm6, 0               ;replicate it to all four dwords
123    movdqa      krd, xmm6                   ;rounding
124
125    ;Compute max and min values of a pixel
126    mov         rdx, 0x00010001             ;two 16-bit words, each equal to 1
127    movsxd      rcx, DWORD PTR arg(6)       ;bps
128    movq        xmm0, rdx
129    movq        xmm1, rcx                   ;shift count for psllw
130    pshufd      xmm0, xmm0, 0b              ;every word = 1
131    movdqa      xmm2, xmm0                  ;keep a copy of the ones
132    psllw       xmm0, xmm1                  ;every word = 1 << bps
133    psubw       xmm0, xmm2                  ;every word = (1 << bps) - 1
134    pxor        xmm1, xmm1                  ;all zero
135    movdqa      max, xmm0                  ;max value (for clamping)
136    movdqa      min, xmm1                  ;min value (for clamping)
137%endm
138
; LOAD_VERT_8 offset: load eight consecutive source rows, 16 bytes each
; (unaligned loads), into xmm0..xmm7 so that xmmN holds row N.
;   %1 (offset): byte offset added to every row address.
;   Expects:  rsi = address of row 0, rax = row stride in bytes,
;             rdx = 3 * rax (the row numbers below only hold under that
;             assumption; the functions in this file set rdx up that way).
;   Side effect: rsi is advanced by one row (rsi += rax), so on exit it
;             points at row 1 of the rows just loaded.
139%macro LOAD_VERT_8 1
140    movdqu      xmm0, [rsi + %1]            ;row 0
141    movdqu      xmm1, [rsi + rax + %1]      ;row 1
142    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;row 6 (2 * 3 strides)
143    lea         rsi,  [rsi + rax]           ;step down one row; offsets below are relative to row 1
144    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;row 7 (1 + 6)
145    movdqu      xmm2, [rsi + rax + %1]      ;row 2 (1 + 1)
146    movdqu      xmm3, [rsi + rax * 2 + %1]  ;row 3 (1 + 2)
147    movdqu      xmm4, [rsi + rdx + %1]      ;row 4 (1 + 3)
148    movdqu      xmm5, [rsi + rax * 4 + %1]  ;row 5 (1 + 4)
149%endm
150
; HIGH_APPLY_FILTER_8 avg, offset: filter eight 16-bit samples and store
; them at [rdi + offset].
;   %1 (avg):    when non-zero the result is averaged (pavgw) with the eight
;                words already present at the destination before storing.
;   %2 (offset): byte offset added to rdi for the destination access.
;   Inputs:   xmm0..xmm7 each hold eight samples, multiplied by taps k0..k7
;             respectively; k0k1, k6k7, k2k5, k3k4, krd, max, min as prepared
;             by HIGH_GET_FILTERS; temp = 16-byte scratch slot; rdi = dest.
;   Result:   clamp((sum(tap * sample) + krd) >> 7, min, max), 16 bytes
;             stored with an unaligned store.
;   Clobbers: xmm0-xmm7 and temp.
;   All eight xmm registers are live on entry, so temp is used as a ninth
;   register while the samples are split into low/high halves.
151%macro HIGH_APPLY_FILTER_8 2
152    movdqu      temp, xmm4                  ;spill tap-4 samples to free xmm4
153    movdqa      xmm4, xmm0
154    punpcklwd   xmm0, xmm1                  ;xmm0 = taps 0,1 samples, low four pixels
155    punpckhwd   xmm4, xmm1                  ;xmm4 = taps 0,1 samples, high four pixels
156    movdqa      xmm1, xmm6
157    punpcklwd   xmm6, xmm7                  ;xmm6 = taps 6,7 low
158    punpckhwd   xmm1, xmm7                  ;xmm1 = taps 6,7 high
159    movdqa      xmm7, xmm2
160    punpcklwd   xmm2, xmm5                  ;xmm2 = taps 2,5 low
161    punpckhwd   xmm7, xmm5                  ;xmm7 = taps 2,5 high
162
163    movdqu      xmm5, temp                  ;reload tap-4 samples
164    movdqu      temp, xmm4                  ;spill taps 0,1 high
165    movdqa      xmm4, xmm3
166    punpcklwd   xmm3, xmm5                  ;xmm3 = taps 3,4 low
167    punpckhwd   xmm4, xmm5                  ;xmm4 = taps 3,4 high
168    movdqu      xmm5, temp                  ;xmm5 = taps 0,1 high
169
    ;Each pmaddwd yields tapA*a + tapB*b as a 32-bit value per pixel.
170    pmaddwd     xmm0, k0k1
171    pmaddwd     xmm5, k0k1
172    pmaddwd     xmm6, k6k7
173    pmaddwd     xmm1, k6k7
174    pmaddwd     xmm2, k2k5
175    pmaddwd     xmm7, k2k5
176    pmaddwd     xmm3, k3k4
177    pmaddwd     xmm4, k3k4
178
    ;xmm0 accumulates the low four pixels, xmm5 the high four pixels.
179    paddd       xmm0, xmm6
180    paddd       xmm0, xmm2
181    paddd       xmm0, xmm3
182    paddd       xmm5, xmm1
183    paddd       xmm5, xmm7
184    paddd       xmm5, xmm4
185
186    paddd       xmm0, krd                   ;rounding
187    paddd       xmm5, krd
188    psrad       xmm0, 7                     ;shift
189    psrad       xmm5, 7
190    packssdw    xmm0, xmm5                  ;pack back to word (signed saturation), low then high pixels
191
192    ;clamp the values
193    pminsw      xmm0, max
194    pmaxsw      xmm0, min
195
196%if %1
197    movdqu      xmm1, [rdi + %2]            ;current destination samples
198    pavgw       xmm0, xmm1                  ;rounded average with them
199%endif
200    movdqu      [rdi + %2], xmm0
201%endm
202
203SECTION .text
204
205;void aom_highbd_filter_block1d4_v8_sse2
206;(
207;    unsigned short *src_ptr,
208;    unsigned int   src_pitch,
209;    unsigned short *output_ptr,
210;    unsigned int   out_pitch,
211;    unsigned int   output_height,
212;    short *filter,
;    int bps
213;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Vertical 8-tap filter, four samples wide.
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output row r is built from source rows r .. r+7, counted from src_ptr:
;       out = clamp((sum(filter[t] * src[r + t]) + 64) >> 7, 0, (1 << bps) - 1)
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
214global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
215sym(aom_highbd_filter_block1d4_v8_sse2):
216    push        rbp
217    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) read below.
218    SHADOW_ARGS_TO_STACK 7
219    SAVE_XMM 7
220    push        rsi
221    push        rdi
222    push        rbx
223    ; end prolog
224
    ;Seven 16-byte stack slots for the constants; they are accessed with
    ;movdqa, hence the 16-byte stack alignment requested here.
225    ALIGN_STACK 16, rax
226    sub         rsp, 16 * 7
227    %define k0k6 [rsp + 16 * 0]
228    %define k2k5 [rsp + 16 * 1]
229    %define k3k4 [rsp + 16 * 2]
230    %define k1k7 [rsp + 16 * 3]
231    %define krd [rsp + 16 * 4]
232    %define max [rsp + 16 * 5]
233    %define min [rsp + 16 * 6]
234
235    HIGH_GET_FILTERS_4
236
237    mov         rsi, arg(0)                 ;src_ptr
238    mov         rdi, arg(2)                 ;output_ptr
239
240    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
241    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
242    lea         rax, [rax + rax]            ;bytes per line
243    lea         rbx, [rbx + rbx]            ;output bytes per line
244    lea         rdx, [rax + rax * 2]        ;rdx = 3 source rows in bytes
245    movsxd      rcx, DWORD PTR arg(4)       ;output_height
246
    ;One output row per iteration. The lea inside the load sequence moves rsi
    ;down one row, which is also the per-iteration source advance.
247.loop:
248    movq        xmm0, [rsi]                 ;load src: row 0
249    movq        xmm1, [rsi + rax]           ;1
250    movq        xmm6, [rsi + rdx * 2]       ;6
251    lea         rsi,  [rsi + rax]           ;rsi now at row 1; offsets below are relative to it
252    movq        xmm7, [rsi + rdx * 2]       ;7
253    movq        xmm2, [rsi + rax]           ;2
254    movq        xmm3, [rsi + rax * 2]       ;3
255    movq        xmm4, [rsi + rdx]           ;4
256    movq        xmm5, [rsi + rax * 4]       ;5
257
258    HIGH_APPLY_FILTER_4 0                   ;0 = plain store, no averaging
259
260    lea         rdi, [rdi + rbx]            ;next output row
261    dec         rcx
262    jnz         .loop
263
264    add rsp, 16 * 7
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
265    pop rsp
266    pop rbx
267    ; begin epilog
268    pop rdi
269    pop rsi
270    RESTORE_XMM
271    UNSHADOW_ARGS
272    pop         rbp
273    ret
274
275;void aom_highbd_filter_block1d8_v8_sse2
276;(
277;    unsigned short *src_ptr,
278;    unsigned int   src_pitch,
279;    unsigned short *output_ptr,
280;    unsigned int   out_pitch,
281;    unsigned int   output_height,
282;    short *filter,
;    int bps
283;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Vertical 8-tap filter, eight samples wide.
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output row r is built from source rows r .. r+7, counted from src_ptr:
;       out = clamp((sum(filter[t] * src[r + t]) + 64) >> 7, 0, (1 << bps) - 1)
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
284global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
285sym(aom_highbd_filter_block1d8_v8_sse2):
286    push        rbp
287    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) that are read.
288    SHADOW_ARGS_TO_STACK 7
289    SAVE_XMM 7
290    push        rsi
291    push        rdi
292    push        rbx
293    ; end prolog
294
    ;Eight 16-byte stack slots: seven constants plus the temp spill slot used
    ;by HIGH_APPLY_FILTER_8.
295    ALIGN_STACK 16, rax
296    sub         rsp, 16 * 8
297    %define k0k1 [rsp + 16 * 0]
298    %define k6k7 [rsp + 16 * 1]
299    %define k2k5 [rsp + 16 * 2]
300    %define k3k4 [rsp + 16 * 3]
301    %define krd [rsp + 16 * 4]
302    %define temp [rsp + 16 * 5]
303    %define max [rsp + 16 * 6]
304    %define min [rsp + 16 * 7]
305
    ;Also loads rsi = src_ptr and rdi = output_ptr.
306    HIGH_GET_FILTERS
307
308    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
309    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
310    lea         rax, [rax + rax]            ;bytes per line
311    lea         rbx, [rbx + rbx]            ;output bytes per line
312    lea         rdx, [rax + rax * 2]        ;rdx = 3 source rows in bytes (required by LOAD_VERT_8)
313    movsxd      rcx, DWORD PTR arg(4)       ;output_height
314
    ;One output row per iteration; LOAD_VERT_8 itself advances rsi by one row.
315.loop:
316    LOAD_VERT_8 0
317    HIGH_APPLY_FILTER_8 0, 0                ;no averaging, destination offset 0
318
319    lea         rdi, [rdi + rbx]            ;next output row
320    dec         rcx
321    jnz         .loop
322
323    add rsp, 16 * 8
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
324    pop rsp
325    pop rbx
326    ; begin epilog
327    pop rdi
328    pop rsi
329    RESTORE_XMM
330    UNSHADOW_ARGS
331    pop         rbp
332    ret
333
334;void aom_highbd_filter_block1d16_v8_sse2
335;(
336;    unsigned short *src_ptr,
337;    unsigned int   src_pitch,
338;    unsigned short *output_ptr,
339;    unsigned int   out_pitch,
340;    unsigned int   output_height,
341;    short *filter,
;    int bps
342;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Vertical 8-tap filter, sixteen samples wide, processed as two halves of
; eight samples (byte offsets 0 and 16) per row.
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output row r is built from source rows r .. r+7, counted from src_ptr:
;       out = clamp((sum(filter[t] * src[r + t]) + 64) >> 7, 0, (1 << bps) - 1)
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
343global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
344sym(aom_highbd_filter_block1d16_v8_sse2):
345    push        rbp
346    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) that are read.
347    SHADOW_ARGS_TO_STACK 7
348    SAVE_XMM 7
349    push        rsi
350    push        rdi
351    push        rbx
352    ; end prolog
353
    ;Eight 16-byte stack slots: seven constants plus the temp spill slot used
    ;by HIGH_APPLY_FILTER_8.
354    ALIGN_STACK 16, rax
355    sub         rsp, 16 * 8
356    %define k0k1 [rsp + 16 * 0]
357    %define k6k7 [rsp + 16 * 1]
358    %define k2k5 [rsp + 16 * 2]
359    %define k3k4 [rsp + 16 * 3]
360    %define krd [rsp + 16 * 4]
361    %define temp [rsp + 16 * 5]
362    %define max [rsp + 16 * 6]
363    %define min [rsp + 16 * 7]
364
    ;Also loads rsi = src_ptr and rdi = output_ptr.
365    HIGH_GET_FILTERS
366
367    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
368    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
369    lea         rax, [rax + rax]            ;bytes per line
370    lea         rbx, [rbx + rbx]            ;output bytes per line
371    lea         rdx, [rax + rax * 2]        ;rdx = 3 source rows in bytes (required by LOAD_VERT_8)
372    movsxd      rcx, DWORD PTR arg(4)       ;output_height
373
374.loop:
375    LOAD_VERT_8 0                           ;left eight samples; advances rsi by one row
376    HIGH_APPLY_FILTER_8 0, 0
377    sub         rsi, rax                    ;undo that advance so the right half reads the same rows
378
379    LOAD_VERT_8 16                          ;right eight samples; this advance is kept for the next row
380    HIGH_APPLY_FILTER_8 0, 16
381    add         rdi, rbx                    ;next output row
382
383    dec         rcx
384    jnz         .loop
385
386    add rsp, 16 * 8
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
387    pop rsp
388    pop rbx
389    ; begin epilog
390    pop rdi
391    pop rsi
392    RESTORE_XMM
393    UNSHADOW_ARGS
394    pop         rbp
395    ret
396
397;void aom_highbd_filter_block1d4_h8_sse2
398;(
399;    unsigned short *src_ptr,
400;    unsigned int    src_pixels_per_line,
401;    unsigned short *output_ptr,
402;    unsigned int    output_pitch,
403;    unsigned int    output_height,
404;    short *filter,
;    int bps
405;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Horizontal 8-tap filter, four samples wide.
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output sample x is built from source samples x-3 .. x+4 of the same row:
;       out = clamp((sum(filter[t] * src[x - 3 + t]) + 64) >> 7, 0, (1 << bps) - 1)
;     so each row reads from 6 bytes before src_ptr up to 18 bytes after it
;     (two 16-byte loads at -6 and +2).
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
406global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
407sym(aom_highbd_filter_block1d4_h8_sse2):
408    push        rbp
409    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) read below.
410    SHADOW_ARGS_TO_STACK 7
411    SAVE_XMM 7
412    push        rsi
413    push        rdi
414    ; end prolog
415
    ;Seven 16-byte stack slots for the constants; they are accessed with
    ;movdqa, hence the 16-byte stack alignment requested here.
416    ALIGN_STACK 16, rax
417    sub         rsp, 16 * 7
418    %define k0k6 [rsp + 16 * 0]
419    %define k2k5 [rsp + 16 * 1]
420    %define k3k4 [rsp + 16 * 2]
421    %define k1k7 [rsp + 16 * 3]
422    %define krd [rsp + 16 * 4]
423    %define max [rsp + 16 * 5]
424    %define min [rsp + 16 * 6]
425
426    HIGH_GET_FILTERS_4
427
428    mov         rsi, arg(0)                 ;src_ptr
429    mov         rdi, arg(2)                 ;output_ptr
430
431    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
432    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
433    lea         rax, [rax + rax]            ;bytes per line
434    lea         rdx, [rdx + rdx]            ;output bytes per line
435    movsxd      rcx, DWORD PTR arg(4)       ;output_height
436
    ;One output row per iteration. Two overlapping loads are shifted by whole
    ;samples (2 bytes each) so that the low four words of xmmN start at
    ;sample x-3+N, i.e. the data for tap N.
437.loop:
438    movdqu      xmm0,   [rsi - 6]           ;load src: samples starting at x-3 (tap 0)
439    movdqu      xmm4,   [rsi + 2]           ;samples starting at x+1 (tap 4)
440    movdqa      xmm1, xmm0
441    movdqa      xmm6, xmm4
442    movdqa      xmm7, xmm4
443    movdqa      xmm2, xmm0
444    movdqa      xmm3, xmm0
445    movdqa      xmm5, xmm4
446
447    psrldq      xmm1, 2                     ;tap 1: starts at x-2
448    psrldq      xmm6, 4                     ;tap 6: starts at x+3
449    psrldq      xmm7, 6                     ;tap 7: starts at x+4
450    psrldq      xmm2, 4                     ;tap 2: starts at x-1
451    psrldq      xmm3, 6                     ;tap 3: starts at x
452    psrldq      xmm5, 2                     ;tap 5: starts at x+2
453
454    HIGH_APPLY_FILTER_4 0                   ;0 = plain store, no averaging
455
456    lea         rsi, [rsi + rax]            ;next source row
457    lea         rdi, [rdi + rdx]            ;next output row
458    dec         rcx
459    jnz         .loop
460
461    add rsp, 16 * 7
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
462    pop rsp
463
464    ; begin epilog
465    pop rdi
466    pop rsi
467    RESTORE_XMM
468    UNSHADOW_ARGS
469    pop         rbp
470    ret
471
472;void aom_highbd_filter_block1d8_h8_sse2
473;(
474;    unsigned short *src_ptr,
475;    unsigned int    src_pixels_per_line,
476;    unsigned short *output_ptr,
477;    unsigned int    output_pitch,
478;    unsigned int    output_height,
479;    short *filter,
;    int bps
480;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Horizontal 8-tap filter, eight samples wide.
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output sample x is built from source samples x-3 .. x+4 of the same row:
;       out = clamp((sum(filter[t] * src[x - 3 + t]) + 64) >> 7, 0, (1 << bps) - 1)
;     so each row reads from 6 bytes before src_ptr up to 24 bytes after it.
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
481global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
482sym(aom_highbd_filter_block1d8_h8_sse2):
483    push        rbp
484    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) that are read.
485    SHADOW_ARGS_TO_STACK 7
486    SAVE_XMM 7
487    push        rsi
488    push        rdi
489    ; end prolog
490
    ;Eight 16-byte stack slots: seven constants plus the temp spill slot used
    ;by HIGH_APPLY_FILTER_8.
491    ALIGN_STACK 16, rax
492    sub         rsp, 16 * 8
493    %define k0k1 [rsp + 16 * 0]
494    %define k6k7 [rsp + 16 * 1]
495    %define k2k5 [rsp + 16 * 2]
496    %define k3k4 [rsp + 16 * 3]
497    %define krd [rsp + 16 * 4]
498    %define temp [rsp + 16 * 5]
499    %define max [rsp + 16 * 6]
500    %define min [rsp + 16 * 7]
501
    ;Also loads rsi = src_ptr and rdi = output_ptr.
502    HIGH_GET_FILTERS
503
504    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
505    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
506    lea         rax, [rax + rax]            ;bytes per line
507    lea         rdx, [rdx + rdx]            ;output bytes per line
508    movsxd      rcx, DWORD PTR arg(4)       ;output_height
509
    ;One output row per iteration. xmmN is loaded starting N samples (2 bytes
    ;each) after x-3, i.e. with the data for tap N.
510.loop:
511    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0, starts at x-3
512    movdqu      xmm1,   [rsi - 4]           ;tap 1
513    movdqu      xmm2,   [rsi - 2]           ;tap 2
514    movdqu      xmm3,   [rsi]               ;tap 3, starts at x
515    movdqu      xmm4,   [rsi + 2]           ;tap 4
516    movdqu      xmm5,   [rsi + 4]           ;tap 5
517    movdqu      xmm6,   [rsi + 6]           ;tap 6
518    movdqu      xmm7,   [rsi + 8]           ;tap 7, starts at x+4
519
520    HIGH_APPLY_FILTER_8 0, 0                ;no averaging, destination offset 0
521
522    lea         rsi, [rsi + rax]            ;next source row
523    lea         rdi, [rdi + rdx]            ;next output row
524    dec         rcx
525    jnz         .loop
526
527    add rsp, 16 * 8
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
528    pop rsp
529
530    ; begin epilog
531    pop rdi
532    pop rsi
533    RESTORE_XMM
534    UNSHADOW_ARGS
535    pop         rbp
536    ret
537
538;void aom_highbd_filter_block1d16_h8_sse2
539;(
540;    unsigned short *src_ptr,
541;    unsigned int    src_pixels_per_line,
542;    unsigned short *output_ptr,
543;    unsigned int    output_pitch,
544;    unsigned int    output_height,
545;    short *filter,
;    int bps
546;)
; NOTE(review): the C types above are inferred from how the arguments are
; used below (16-bit samples, seven arguments) - confirm against the C
; declaration.
;
; Horizontal 8-tap filter, sixteen samples wide, processed as two halves of
; eight samples per row (destination byte offsets 0 and 16).
;   - Samples are 16 bits wide; both pitches are read as 32-bit values,
;     sign-extended and doubled to obtain byte strides.
;   - Output sample x is built from source samples x-3 .. x+4 of the same row:
;       out = clamp((sum(filter[t] * src[x - 3 + t]) + 64) >> 7, 0, (1 << bps) - 1)
;     so each row reads from 6 bytes before src_ptr up to 40 bytes after it.
;   - filter is read with movdqa and therefore has to be 16-byte aligned.
;   - The loop is bottom-tested: output_height must be at least 1.
547global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
548sym(aom_highbd_filter_block1d16_h8_sse2):
549    push        rbp
550    mov         rbp, rsp
    ;SHADOW_ARGS_TO_STACK / SAVE_XMM / arg() come from the included
    ;x86_abi_support.asm; 7 matches the arguments arg(0)..arg(6) that are read.
551    SHADOW_ARGS_TO_STACK 7
552    SAVE_XMM 7
553    push        rsi
554    push        rdi
555    ; end prolog
556
    ;Eight 16-byte stack slots: seven constants plus the temp spill slot used
    ;by HIGH_APPLY_FILTER_8.
557    ALIGN_STACK 16, rax
558    sub         rsp, 16 * 8
559    %define k0k1 [rsp + 16 * 0]
560    %define k6k7 [rsp + 16 * 1]
561    %define k2k5 [rsp + 16 * 2]
562    %define k3k4 [rsp + 16 * 3]
563    %define krd [rsp + 16 * 4]
564    %define temp [rsp + 16 * 5]
565    %define max [rsp + 16 * 6]
566    %define min [rsp + 16 * 7]
567
    ;Also loads rsi = src_ptr and rdi = output_ptr.
568    HIGH_GET_FILTERS
569
570    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
571    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
572    lea         rax, [rax + rax]            ;bytes per line
573    lea         rdx, [rdx + rdx]            ;output bytes per line
574    movsxd      rcx, DWORD PTR arg(4)       ;output_height
575
    ;One output row per iteration. xmmN is loaded starting N samples (2 bytes
    ;each) after the first tap position, i.e. with the data for tap N.
576.loop:
    ;left eight output samples
577    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0, starts at x-3
578    movdqu      xmm1,   [rsi - 4]           ;tap 1
579    movdqu      xmm2,   [rsi - 2]           ;tap 2
580    movdqu      xmm3,   [rsi]               ;tap 3, starts at x
581    movdqu      xmm4,   [rsi + 2]           ;tap 4
582    movdqu      xmm5,   [rsi + 4]           ;tap 5
583    movdqu      xmm6,   [rsi + 6]           ;tap 6
584    movdqu      xmm7,   [rsi + 8]           ;tap 7, starts at x+4
585
586    HIGH_APPLY_FILTER_8 0, 0                ;no averaging, destination offset 0
587
    ;right eight output samples: same pattern shifted by 16 bytes (8 samples)
588    movdqu      xmm0,   [rsi + 10]           ;load src: tap 0 (16 - 6)
589    movdqu      xmm1,   [rsi + 12]           ;tap 1
590    movdqu      xmm2,   [rsi + 14]           ;tap 2
591    movdqu      xmm3,   [rsi + 16]           ;tap 3
592    movdqu      xmm4,   [rsi + 18]           ;tap 4
593    movdqu      xmm5,   [rsi + 20]           ;tap 5
594    movdqu      xmm6,   [rsi + 22]           ;tap 6
595    movdqu      xmm7,   [rsi + 24]           ;tap 7 (16 + 8)
596
597    HIGH_APPLY_FILTER_8 0, 16               ;no averaging, destination offset 16 bytes
598
599    lea         rsi, [rsi + rax]            ;next source row
600    lea         rdi, [rdi + rdx]            ;next output row
601    dec         rcx
602    jnz         .loop
603
604    add rsp, 16 * 8
    ;presumably restores the rsp value saved by ALIGN_STACK - confirm in
    ;x86_abi_support.asm
605    pop rsp
606
607    ; begin epilog
608    pop rdi
609    pop rsi
610    RESTORE_XMM
611    UNSHADOW_ARGS
612    pop         rbp
613    ret
614