;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION .text

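; SAD_FN arguments (as used by the width-specific macros below):
;   %1  block width
;   %2  block height
;   %3  register layout: 5 for the plain layout, 7 when the
;       src_stride3/ref_stride3 helper registers are needed (SAD32XN)
;   %4  0 for plain SAD, 1 for the _avg variant, which takes an extra
;       second_pred argument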
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
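  ; Sign-extend the 32-bit stride arguments to full register width if needed.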
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_avx2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
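; The _avg variants take one additional argument, a second_pred pointer,
; after ref_stride (see SAD_FN above).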
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0

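  ; Each iteration covers one 128-byte row: four unaligned 32-byte loads
  ; from ref, SAD against the matching src bytes, accumulated into m0.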
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+32]
  movu                  m3, [refq+64]
  movu                  m4, [refq+96]
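  ; _avg variant: average ref with second_pred (rounded byte average)
  ; before taking the SAD against src.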
%if %2 == 1
  vpavgb                m1, [second_predq+mmsize*0]
  vpavgb                m2, [second_predq+mmsize*1]
  vpavgb                m3, [second_predq+mmsize*2]
  vpavgb                m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  vpsadbw               m1, [srcq]
  vpsadbw               m2, [srcq+32]
  vpsadbw               m3, [srcq+64]
  vpsadbw               m4, [srcq+96]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  vpaddd                m1, m2
  vpaddd                m3, m4
  vpaddd                m0, m1
  vpaddd                m0, m3

  dec              n_rowsd
  jg .loop

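  ; m0 holds four 64-bit partial sums, one per qword lane; fold the upper
  ; 128-bit lane and then the upper qword so the total lands in the low
  ; dword, which is returned in eax.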
  vextracti128         xm1, m0, 1
  paddd                xm0, xm1

  movhlps              xm1, xm0
  paddd                xm0, xm1
  movd                 eax, xm0

  RET
%endmacro

INIT_YMM avx2
SAD128XN 128     ; sad128x128_avx2
SAD128XN 128, 1  ; sad128x128_avg_avx2
SAD128XN 64      ; sad128x64_avx2
SAD128XN 64, 1   ; sad128x64_avg_avx2


; unsigned int aom_sad64x64_avx2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
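  ; Two 64-byte rows per iteration, so the row counter is height/2.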
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+32]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+32]
%if %2 == 1
  vpavgb                m1, [second_predq+mmsize*0]
  vpavgb                m2, [second_predq+mmsize*1]
  vpavgb                m3, [second_predq+mmsize*2]
  vpavgb                m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  vpsadbw               m1, [srcq]
  vpsadbw               m2, [srcq+32]
  vpsadbw               m3, [srcq+src_strideq]
  vpsadbw               m4, [srcq+src_strideq+32]

  vpaddd                m1, m2
  vpaddd                m3, m4
  lea                 refq, [refq+ref_strideq*2]
  vpaddd                m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  vpaddd                m0, m3
  dec              n_rowsd
  jg .loop

  vextracti128         xm1, m0, 1
  paddd                xm0, xm1

  movhlps              xm1, xm0
  paddd                xm0, xm1
  movd                 eax, xm0
  RET
%endmacro

INIT_YMM avx2
SAD64XN 128     ; sad64x128_avx2
SAD64XN 128, 1  ; sad64x128_avg_avx2
SAD64XN 64      ; sad64x64_avx2
SAD64XN 32      ; sad64x32_avx2
SAD64XN 64, 1   ; sad64x64_avg_avx2
SAD64XN 32, 1   ; sad64x32_avg_avx2
SAD64XN 16      ; sad64x16_avx2
SAD64XN 16, 1   ; sad64x16_avg_avx2


; unsigned int aom_sad32x32_avx2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 7, %2
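  ; Four 32-byte rows per iteration (height/4), with the fourth row
  ; addressed through the stride*3 registers set up by SAD_FN's
  ; 7-register layout.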
  mov              n_rowsd, %1/4
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  vpavgb                m1, [second_predq+mmsize*0]
  vpavgb                m2, [second_predq+mmsize*1]
  vpavgb                m3, [second_predq+mmsize*2]
  vpavgb                m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]

  vpaddd                m1, m2
  vpaddd                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  vpaddd                m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  vpaddd                m0, m3
  dec              n_rowsd
  jg .loop

  vextracti128         xm1, m0, 1
  paddd                xm0, xm1

  movhlps              xm1, xm0
  paddd                xm0, xm1
  movd                 eax, xm0
  RET
%endmacro

INIT_YMM avx2
SAD32XN 64     ; sad32x64_avx2
SAD32XN 32     ; sad32x32_avx2
SAD32XN 16     ; sad32x16_avx2
SAD32XN 64, 1  ; sad32x64_avg_avx2
SAD32XN 32, 1  ; sad32x32_avg_avx2
SAD32XN 16, 1  ; sad32x16_avg_avx2
SAD32XN 8      ; sad32x8_avx2
SAD32XN 8, 1   ; sad32x8_avg_avx2