; Copyright (c) 2019, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
maddubsw_hsub: times 16 db 1, -1

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; Perform 4x4 Hadamard transform on input with 2 rows per register.
; Rows 0 and 2 are in m0 and rows 1 and 3 are in m1.
; A second set of packed input can also be taken in m2 and m3.
; Ends with sums in every other entry (i.e. already reduced horizontally).
%macro HADAMARD_4x4_PACKED 1
%if %1 == 1
    %define tmp m2
    ; 2->0, 1->2, 0->1
    %define ROTATE SWAP 2, 1, 0
%elif %1 == 2
    %define tmp m4
    ; 4->0, 3->4, 2->3, 1->2, 0->1
    %define ROTATE SWAP 4, 3, 2, 1, 0
%endif
    ; m0  d2 c2 b2 a2 d0 c0 b0 a0
    ; m1  d3 c3 b3 a3 d1 c1 b1 a1

    ; Stage 1
    ; m0  d2+d3 c2+c3 b2+b3 a2+a3 d0+d1 c0+c1 b0+b1 a0+a1
    ; m1  d2-d3 c2-c3 b2-b3 a2-a3 d0-d1 c0-c1 b0-b1 a0-a1
    paddw              tmp, m0, m1
    psubw               m0, m1
%if %1 == 2
    paddw               m1, m2, m3
    psubw               m2, m3
%endif
    ROTATE

    ; Stage 2
    ; m0  d0-d1 d0+d1 c0-c1 c0+c1 b0-b1 b0+b1 a0-a1 a0+a1
    ; m1  d2-d3 d2+d3 c2-c3 c2+c3 b2-b3 b2+b3 a2-a3 a2+a3
    punpcklwd          tmp, m0, m1
    punpckhwd           m0, m1
%if %1 == 2
    punpcklwd           m1, m2, m3
    punpckhwd           m2, m3
%endif
    ROTATE

    ; m0  d0-d1+d2-d3 d0+d1+d2+d3 c0-c1+c2-c3 c0+c1+c2+c3
    ;     b0-b1+b2-b3 b0+b1+b2+b3 a0-a1+a2-a3 a0+a1+a2+a3
    ; m1  d0-d1-d2+d3 d0+d1-d2-d3 c0-c1-c2+c3 c0+c1-c2-c3
    ;     b0-b1-b2+b3 b0+b1-b2-b3 a0-a1-a2+a3 a0+a1-a2-a3
    paddw              tmp, m0, m1
    psubw               m0, m1
%if %1 == 2
    paddw               m1, m2, m3
    psubw               m2, m3
%endif
    ROTATE

    ; m0  s2 s0 r2 r0 q2 q0 p2 p0
    ; m1  s3 s1 r3 r1 q3 q1 p3 p1

    ; Stage 1
    ; m0  q3 q1 q2 q0 p3 p1 p2 p0
    ; m1  s3 s1 s2 s0 r3 r1 r2 r0
    punpckldq          tmp, m0, m1
    punpckhdq           m0, m1
%if %1 == 2
    punpckldq           m1, m2, m3
    punpckhdq           m2, m3
%endif
    ROTATE

    ; m0  q3+s3 q1+s1 q2+s2 q0+s0 p3+r3 p1+r1 p2+r2 p0+r0
    ; m1  q3-s3 q1-s1 q2-s2 q0-s0 p3-r3 p1-r1 p2-r2 p0-r0
    paddw              tmp, m0, m1
    psubw               m0, m1
%if %1 == 2
    paddw               m1, m2, m3
    psubw               m2, m3
%endif
    ROTATE

    ; Stage 2
    ; m0  p3-r3 p1-r1 p2-r2 p0-r0 p3+r3 p1+r1 p2+r2 p0+r0
    ; m1  q3-s3 q1-s1 q2-s2 q0-s0 q3+s3 q1+s1 q2+s2 q0+s0
    punpcklqdq         tmp, m0, m1
    punpckhqdq          m0, m1
%if %1 == 2
    punpcklqdq          m1, m2, m3
    punpckhqdq          m2, m3
%endif
    ROTATE

    ; Use the fact that
    ;   (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
    ;  to merge the final butterfly with the abs and the first stage of
    ;  accumulation.
    ; Avoid pabsw by using max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead.
    ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
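    ; (paddsw saturates a + b + 0x7FFF to 0x7FFF whenever a + b >= 0.)
    ; e.g. for a = -3, b = 5: max(a, b) = 5 and a + b + 0x7FFF saturates to
    ;  0x7FFF, giving 5 - 0x7FFF = max(|a|, |b|) - 0x7FFF; for a = -7, b = 2
    ;  there is no saturation and 2 - (0x7FFF - 5) = 7 - 0x7FFF.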
    paddw              tmp, m0, m1
    pmaxsw              m0, m1
    ; m1 is free
    ; 0x7FFF
    pcmpeqb             m1, m1
    psrlw               m1, 1

    paddsw             tmp, m1
    psubw               m0, tmp
%if %1 == 2
    paddw              tmp, m2, m3
    pmaxsw              m2, m3
    paddsw             tmp, m1
    psubw               m2, tmp

    paddw               m0, m2
%endif
%endmacro

; Load diffs of 4 entries for 2 rows
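; The first row's diffs end up in words 0-3 of m%1 and the second row's in
;  words 4-7; m%6 and m%7 are used as temporaries.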
%macro LOAD_PACK_DIFF_Dx2 7
    movd               m%1, %2
    movd               m%6, %4
    punpckldq          m%1, m%6
    pmovzxbw           m%1, m%1
    movd               m%6, %3
    movd               m%7, %5
    punpckldq          m%6, m%7
    pmovzxbw           m%6, m%6
    psubw              m%1, m%6
%endmacro

; Can only use 128-bit vectors
%macro SATD_4x4_FN 0
cglobal satd_4x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
                           src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]

    ; Load rows 0 and 2 to m0 and 1 and 3 to m1
    LOAD_PACK_DIFF_Dx2 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          2, 3
    LOAD_PACK_DIFF_Dx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          2, 3

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    pshufd              m1, m0, q3232
    paddw               m0, m1
    pshuflw             m1, m0, q3232
    paddw               m0, m1
    pshuflw             m1, m0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw               m0, m1
    movd               eax, m0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ;  accumulation was done. Since this offset is an even number, this can
    ;  safely be done after normalization using pavgw.
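    ; Here the bias is -0x7FFF in each of the 8 words, i.e. -8*0x7FFF = +8
    ;  modulo 2^16 over the sum, which pavgw halves to +4.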
    sub                 ax, 4
    RET
%endmacro

INIT_XMM sse4
SATD_4x4_FN

INIT_XMM avx2
SATD_4x4_FN

; Load diffs of 8 entries for 2 rows
; Each set of 4 columns shares a 128-bit lane
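; Columns 0-3 of both rows land in the low 128-bit lane and columns 4-7 in the
;  high lane; %6 and %7 are used as temporaries.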
%macro LOAD_PACK_DIFF_Qx2 7
    movq              xm%1, %2
    movq              xm%6, %4
    punpckldq         xm%1, xm%6
    pmovzxbw           m%1, xm%1
    movq              xm%6, %3
    movq              xm%7, %5
    punpckldq         xm%6, xm%7
    pmovzxbw           m%6, xm%6
    psubw              m%1, m%6
%endmacro

INIT_YMM avx2
cglobal satd_8x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
                           src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    ; Load rows 0 and 2 to m0 and 1 and 3 to m1
    ; Each set of 4 columns shares a 128-bit lane
    LOAD_PACK_DIFF_Qx2 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                       2, 3
    LOAD_PACK_DIFF_Qx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                       2, 3

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ;  accumulation was done. Since this offset is an even number, this can
    ;  safely be done after normalization using pavgw.
    sub                 ax, 8
    RET

; Load diffs of 4 entries for 4 rows
; Each set of two rows shares a 128-bit lane
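; The diffs of the first two (src, dst) row pairs fill the low 128-bit lane
;  and those of the last two pairs fill the high lane; %10-%12 are used as
;  temporaries.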
%macro LOAD_PACK_DIFF_Dx4 12
    movd              xm%1, %2
    movd             xm%10, %4
    punpckldq         xm%1, xm%10
    movd             xm%10, %6
    movd             xm%11, %8
    punpckldq        xm%10, xm%11
    punpcklqdq        xm%1, xm%10
    pmovzxbw           m%1, xm%1
    movd             xm%10, %3
    movd             xm%11, %5
    punpckldq        xm%10, xm%11
    movd             xm%11, %7
    movd             xm%12, %9
    punpckldq        xm%11, xm%12
    punpcklqdq       xm%10, xm%11
    pmovzxbw          m%10, xm%10
    psubw              m%1, m%10
%endmacro

INIT_YMM avx2
cglobal satd_4x8, 4, 8, 5, src, src_stride, dst, dst_stride, \
                           src4, dst4, src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    lea              src4q, [srcq+src_strideq*4]
    lea              dst4q, [dstq+dst_strideq*4]
    ; Load rows 0, 2, 4 and 6 to m0 and 1, 3, 5 and 7 to m1.
    ; Lanes split the low and high rows of m0 and m1.
    LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                       2, 3, 4
    LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                       2, 3, 4

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation.
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax
    sub                 ax, 8
    RET

; Rudimentary fast Hadamard transform
; Two Hadamard transforms share a 128-bit lane.
%macro HADAMARD_4x4 0
    ; 4->0, 3->4, 2->3, 1->2, 0->1
    %define ROTATE SWAP 4, 3, 2, 1, 0

    ; Stage 1
    paddw               m0, m1, m2
    psubw               m1, m2
    paddw               m2, m3, m4
    psubw               m3, m4
    ROTATE

    ; Stage 2
    paddw               m0, m1, m3
    psubw               m1, m3
    paddw               m3, m2, m4
    psubw               m2, m4
    SWAP                3, 2, 1
    ROTATE

    ; Transpose
    ; Since two transforms share a 128-bit lane, unpacking results in a single
    ;  transform's values on each register. This has to be resolved later.
    ; A and B indicate different 4x4 transforms.

    ; Start
    ; m1  B (a3 a2 a1 a0) A (a3 a2 a1 a0)
    ; m2  B (b3 b2 b1 b0) A (b3 b2 b1 b0)
    ; m3  B (c3 c2 c1 c0) A (c3 c2 c1 c0)
    ; m4  B (d3 d2 d1 d0) A (d3 d2 d1 d0)

    ; Stage 1
    ; m1  A (b3 a3 b2 a2 b1 a1 b0 a0)
    ; m2  B (b3 a3 b2 a2 b1 a1 b0 a0)
    ; m3  A (d3 c3 d2 c2 d1 c1 d0 c0)
    ; m4  B (d3 c3 d2 c2 d1 c1 d0 c0)
    punpcklwd           m0, m1, m2
    punpckhwd           m1, m2
    punpcklwd           m2, m3, m4
    punpckhwd           m3, m4
    ROTATE

    ; m1  A (d3 c3 b3 a3 d2 c2 b2 a2)
    ; m2  A (d1 c1 b1 a1 d0 c0 b0 a0)
    ; m3  B (d3 c3 b3 a3 d2 c2 b2 a2)
    ; m4  B (d1 c1 b1 a1 d0 c0 b0 a0)
    punpckldq           m0, m1, m3
    punpckhdq           m1, m3
    punpckldq           m3, m2, m4
    punpckhdq           m2, m4
    SWAP                3, 2, 1
    ROTATE

    ; Make the transforms share 128-bit lanes again.
    ; m1  B (d0 c0 b0 a0) A (d0 c0 b0 a0)
    ; m2  B (d1 c1 b1 a1) A (d1 c1 b1 a1)
    ; m3  B (d2 c2 b2 a2) A (d2 c2 b2 a2)
    ; m4  B (d3 c3 b3 a3) A (d3 c3 b3 a3)
    punpcklqdq          m0, m1, m2
    punpckhqdq          m1, m2
    punpcklqdq          m2, m3, m4
    punpckhqdq          m3, m4
    ROTATE

    ; Stage 1
    paddw               m0, m1, m2
    psubw               m1, m2
    paddw               m2, m3, m4
    psubw               m3, m4
    ROTATE

    ; Use the fact that
    ;   (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
    ;  to merge the final butterfly with the abs and the first stage of
    ;  accumulation.
    ; Avoid pabsw by using max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead.
    ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
    paddw               m0, m1, m3
    pmaxsw              m1, m3
    ; m3 is free
    ; 0x7FFF
    pcmpeqb             m3, m3
    psrlw               m3, 1

    paddsw              m0, m3
    psubw               m1, m0

    paddw               m0, m2, m4
    pmaxsw              m2, m4
    paddsw              m0, m3
    psubw               m2, m0

    paddw               m1, m2
    SWAP                1, 0
%endmacro

; Load diffs of 16 entries for 1 row
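; m%1 receives the row's 16 word-sized diffs; m%4 is used as a temporary.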
%macro LOAD_DIFF_DQ 4
    movu              xm%1, %2
    movu              xm%4, %3
    vpmovzxbw          m%1, xm%1
    vpmovzxbw          m%4, xm%4
    psubw              m%1, m%4
%endmacro

INIT_YMM avx2
cglobal satd_16x4, 4, 6, 5, src, src_stride, dst, dst_stride, \
                            src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_DQ 1, [srcq], [dstq], 0
    LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_4x4

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation
    ; Avoids overflow in this case
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ;  accumulation was done. Since this offset is an even number, this can
    ;  safely be done after normalization using pavgw.
    sub                 ax, 16
    RET

INIT_YMM avx2
cglobal satd_4x16, 4, 8, 7, src, src_stride, dst, dst_stride, \
                            src4, dst4, src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    lea              src4q, [srcq+src_strideq*4]
    lea              dst4q, [dstq+dst_strideq*4]
    LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                       4, 5, 6
    LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                       4, 5, 6
    lea               srcq, [srcq+src_strideq*8]
    lea               dstq, [dstq+dst_strideq*8]
    lea              src4q, [src4q+src_strideq*8]
    lea              dst4q, [dst4q+dst_strideq*8]
    LOAD_PACK_DIFF_Dx4 2, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                       4, 5, 6
    LOAD_PACK_DIFF_Dx4 3, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                       4, 5, 6
    HADAMARD_4x4_PACKED 2

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ;  accumulation was done. Since this offset is an even number, this can
    ;  safely be done after normalization using pavgw.
    sub                 ax, 16
    RET

; On x86-64 we can transpose in-place without spilling registers.
; By clever choices of the order to apply the butterflies and the order of
;  their outputs, we can take the rows in order and output the columns in order
;  without any extra operations and using just one temporary register.
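; %1-%8 hold the 8 rows on entry and the 8 columns, in order, on exit; %9 is
;  used as a temporary.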
%macro TRANSPOSE8x8 9
    punpckhwd           m%9, m%5, m%6
    punpcklwd           m%5, m%6
    ; m%6 is free
    punpckhwd           m%6, m%1, m%2
    punpcklwd           m%1, m%2
    ; m%2 is free
    punpckhwd           m%2, m%7, m%8
    punpcklwd           m%7, m%8
    ; m%8 is free
    punpckhwd           m%8, m%3, m%4
    punpcklwd           m%3, m%4
    ; m%4 is free
    punpckhdq           m%4, m%1, m%3
    punpckldq           m%1, m%3
    ; m%3 is free
    punpckldq           m%3, m%5, m%7
    punpckhdq           m%5, m%7
    ; m%7 is free
    punpckhdq           m%7, m%6, m%8
    punpckldq           m%6, m%8
    ; m%8 is free
    punpckldq           m%8, m%9, m%2
    punpckhdq           m%9, m%2
    ; m%2 is free
    punpckhqdq          m%2, m%1, m%3
    punpcklqdq          m%1, m%3
    ; m%3 is free
    punpcklqdq          m%3, m%4, m%5
    punpckhqdq          m%4, m%5
    ; m%5 is free
    punpcklqdq          m%5, m%6, m%8
    punpckhqdq          m%6, m%8
    ; m%8 is free
    punpckhqdq          m%8, m%7, m%9
    punpcklqdq          m%7, m%9
%endmacro

; Load diff of 8 entries for 1 row
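; punpcklbw interleaves source and destination bytes so that pmaddubsw with
;  the repeated (+1, -1) pattern in maddubsw_hsub computes src - dst for each
;  byte pair, widened to words.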
%macro LOAD_DIFF_Q 4
    movq                %1, %2
    movq                %4, %3
    punpcklbw           %1, %4
    pmaddubsw           %1, hsub
%endmacro

%macro HADAMARD_8_STAGE_1 9
    paddw              m%9, m%1, m%2
    psubw              m%1, m%2
    paddw              m%2, m%3, m%4
    psubw              m%3, m%4
    paddw              m%4, m%5, m%6
    psubw              m%5, m%6
    paddw              m%6, m%7, m%8
    psubw              m%7, m%8
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP                %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

%macro HADAMARD_8_STAGE_2 9
    paddw              m%9, m%1, m%3 ; 0
    psubw              m%1, m%3      ; 2
    paddw              m%3, m%2, m%4 ; 1
    psubw              m%2, m%4      ; 3
    SWAP                %3, %2, %1
    paddw              m%4, m%5, m%7 ; 4
    psubw              m%5, m%7      ; 6
    paddw              m%7, m%6, m%8 ; 5
    psubw              m%6, m%8      ; 7
    SWAP                %7, %6, %5
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP                %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

%macro HADAMARD_8_STAGE_3 9
    paddw              m%9, m%1, m%5 ; 0
    psubw              m%1, m%5      ; 4
    paddw              m%5, m%2, m%6 ; 1
    psubw              m%2, m%6      ; 5
    paddw              m%6, m%3, m%7 ; 2
    psubw              m%3, m%7      ; 6
    paddw              m%7, m%4, m%8 ; 3
    psubw              m%4, m%8      ; 7
    SWAP                %5, %2, %6, %3, %7, %4, %1
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP                %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

; Rudimentary fast Hadamard transform
%macro HADAMARD_8x8 0
    HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_3 1, 2, 3, 4, 5, 6, 7, 8, 0

    TRANSPOSE8x8 1, 2, 3, 4, 5, 6, 7, 8, 0

    HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0

    ; Stage 3
    ; Use the fact that
    ;   (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
    ;  to merge the final butterfly with the abs and the first stage of
    ;  accumulation.
    ; Avoid pabsw by using max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead.
    ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
    paddw               m0, m1, m5
    pmaxsw              m1, m5
    ; m5 is free
    ; 0x7FFF
    pcmpeqb             m5, m5
    psrlw               m5, 1

    paddsw              m0, m5
    psubw               m1, m0

    paddw               m0, m2, m6
    pmaxsw              m2, m6
    paddsw              m0, m5
    psubw               m2, m0

    paddw               m0, m3, m7
    pmaxsw              m3, m7
    paddsw              m0, m5
    psubw               m3, m0

    paddw               m0, m4, m8
    pmaxsw              m4, m8
    paddsw              m0, m5
    psubw               m4, m0

    paddw               m1, m2
    paddw               m3, m4

    paddw               m1, m3
    SWAP                 1, 0
%endmacro

; Only works with 128-bit vectors
%macro SATD_8x8_FN 0
cglobal satd_8x8, 4, 6, 10, src, src_stride, dst, dst_stride, \
                           src_stride3, dst_stride3
    %define           hsub  m0
    mova              hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_Q m1, [srcq], [dstq], m2
    LOAD_DIFF_Q m2, [srcq+src_strideq*1], [dstq+dst_strideq*1], m3
    LOAD_DIFF_Q m3, [srcq+src_strideq*2], [dstq+dst_strideq*2], m4
    LOAD_DIFF_Q m4, [srcq+src_stride3q], [dstq+dst_stride3q], m5
    lea               srcq, [srcq+src_strideq*4]
    lea               dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_Q m5, [srcq], [dstq], m6
    LOAD_DIFF_Q m6, [srcq+src_strideq*1], [dstq+dst_strideq*1], m7
    LOAD_DIFF_Q m7, [srcq+src_strideq*2], [dstq+dst_strideq*2], m8
    LOAD_DIFF_Q m8, [srcq+src_stride3q], [dstq+dst_stride3q], m9

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor                m2, m2
    punpcklwd           m1, m0, m2
    punpckhwd           m0, m2
    paddd               m0, m1

    pshufd              m1, m0, q3232
    paddd               m0, m1
    pshuflw             m1, m0, q3232
    paddd               m0, m1
    movd               eax, m0

    ; Normalize
    ; Add rounding offset and an offset for how the final butterfly stage and
    ;  the first stage of accumulation was done.
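    ; Each word of m0 carries a -4*0x7FFF bias, i.e. +4 modulo 2^16, so the
    ;  reduced sum is biased by +32; the extra 2 is the rounding offset for
    ;  the shift right by 2.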
    sub                eax, 32-2
    shr                eax, 2
    RET
%endmacro

INIT_XMM ssse3
SATD_8x8_FN

INIT_XMM avx2
SATD_8x8_FN

INIT_YMM avx2
cglobal satd_16x8, 4, 6, 9, src, src_stride, dst, dst_stride, \
                            src_stride3, dst_stride3
    ; Load rows into m1-m8
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_DQ 1, [srcq], [dstq], 0
    LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
    lea               srcq, [srcq+src_strideq*4]
    lea               dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_DQ 5, [srcq], [dstq], 0
    LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor                m2, m2
    punpcklwd           m1, m0, m2
    punpckhwd           m0, m2
    paddd               m0, m1

    vextracti128       xm1, m0, 1
    paddd              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add rounding offset and an offset for how the final butterfly stage and
    ;  the first stage of accumulation was done.
    sub                eax, 64-2
    shr                eax, 2
    RET

%macro LOAD_DIFF_Qx2 7
    movq              xm%1, %2
    movq              xm%6, %3
    punpcklbw         xm%1, xm%6
    movq              xm%6, %4
    movq              xm%7, %5
    punpcklbw         xm%6, xm%7
    vinserti128        m%1, xm%6, 1
    pmaddubsw          m%1, hsub
%endmacro

INIT_YMM avx2
cglobal satd_8x16, 4, 8, 11, src, src_stride, dst, dst_stride, \
                             src8, dst8, src_stride3, dst_stride3
    %define           hsub  m0
    mova              hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    lea              src8q, [srcq+src_strideq*8]
    lea              dst8q, [dstq+dst_strideq*8]
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_Qx2 1, [srcq], [dstq], \
                     [src8q], [dst8q], \
                     9, 10
    LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                     [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                     9, 10
    LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                     [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                     9, 10
    LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \
                     [src8q+src_stride3q], [dst8q+dst_stride3q], \
                     9, 10
    lea               srcq, [srcq+src_strideq*4]
    lea               dstq, [dstq+dst_strideq*4]
    lea              src8q, [src8q+src_strideq*4]
    lea              dst8q, [dst8q+dst_strideq*4]
    LOAD_DIFF_Qx2 5, [srcq], [dstq], \
                     [src8q], [dst8q], \
                     9, 10
    LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                     [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                     9, 10
    LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                     [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                     9, 10
    LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \
                     [src8q+src_stride3q], [dst8q+dst_stride3q], \
                     9, 10

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor                m2, m2
    punpcklwd           m1, m0, m2
    punpckhwd           m0, m2
    paddd               m0, m1

    vextracti128       xm1, m0, 1
    paddd              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add rounding offset and an offset for how the final butterfly stage and
    ;  the first stage of accumulation was done.
    sub                eax, 64-2
    shr                eax, 2
    RET

; Less optimized, boilerplate implementations

INIT_YMM avx2
cglobal satd_8x32, 4, 9, 13, src, src_stride, dst, dst_stride, \
                             src8, dst8, src_stride3, dst_stride3, cnt
    ; ones for converting to 32-bit with pmaddwd
    pcmpeqw            m11, m11
    pabsw              m11, m11
    ; sum
    pxor               m12, m12
    mov               cntd, 1
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    lea              src8q, [srcq+src_strideq*8]
    lea              dst8q, [dstq+dst_strideq*8]
.loop:
    %define           hsub  m0
    mova              hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    LOAD_DIFF_Qx2 1, [srcq], [dstq], \
                     [src8q], [dst8q], \
                  9, 10
    LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                     [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                  9, 10
    LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                     [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                  9, 10
    LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \
                     [src8q+src_stride3q], [dst8q+dst_stride3q], \
                  9, 10
    lea               srcq, [srcq+src_strideq*4]
    lea               dstq, [dstq+dst_strideq*4]
    lea              src8q, [src8q+src_strideq*4]
    lea              dst8q, [dst8q+dst_strideq*4]
    LOAD_DIFF_Qx2 5, [srcq], [dstq], \
                     [src8q], [dst8q], \
                  9, 10
    LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                     [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                  9, 10
    LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                     [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                  9, 10
    LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \
                     [src8q+src_stride3q], [dst8q+dst_stride3q], \
                  9, 10

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pmaddwd             m0, m11
    paddd              m12, m0

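    ; srcq/dstq and src8q/dst8q were already advanced by 4 rows above; adding
    ;  3*stride*4 covers the remaining 12 rows, so each pointer moves a full
    ;  16 rows per iteration.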
    lea               srcq, [srcq+src_stride3q*4]
    lea               dstq, [dstq+dst_stride3q*4]
    lea              src8q, [src8q+src_stride3q*4]
    lea              dst8q, [dst8q+dst_stride3q*4]
    dec               cntd
    jge .loop

    vextracti128       xm0, m12, 1
    paddd              xm0, xm12
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add rounding offset and an offset for how the final butterfly stage and
    ;  the first stage of accumulation was done.
    sub                eax, 128-2
    shr                eax, 2
    RET

INIT_YMM avx2
cglobal satd_16x8_internal, 0, 0, 0, \
                            dummy1, src_stride, dummy2, dst_stride, \
                            src_stride3, dst_stride3, src, dst
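    ; Helper for SATD_NXM below, not a standalone entry point: srcq and dstq
    ;  here occupy the same GPRs as the caller's call_srcq/call_dstq, and the
    ;  caller sets up the strides, the per-word ones in hadd (m9) and the
    ;  running sum in sum (m10).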
    %define hadd m9
    %define sum m10
    ; Load rows into m1-m8
    LOAD_DIFF_DQ 1, [srcq], [dstq], 0
    LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
    lea               srcq, [srcq+src_strideq*4]
    lea               dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_DQ 5, [srcq], [dstq], 0
    LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_8x8

    pmaddwd             m0, hadd
    paddd              sum, m0
    ret

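; Handle the remaining block sizes by tiling them into 16x8 blocks and
;  accumulating satd_16x8_internal over every tile.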
%macro SATD_NXM 2
%if %1 > 16
%if %2 > 8
cglobal satd_%1x%2, 4, 10, 11, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3, call_src, call_dst, \
                              w, h
%else
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3, call_src, call_dst, \
                              w
%endif
%else ; %2 > 8
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3, call_src, call_dst, \
                              h
%endif
    ; ones for converting to 32-bit with pmaddwd
    pcmpeqw             m9, m9
    pabsw               m9, m9
    ; sum
    pxor               m10, m10
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
%if %2 > 8
    mov                 hd, %2/8 - 1
.looph:
%endif
%if %1 > 16
    mov                 wd, %1/16 - 1
.loopv:
%endif
    mov          call_srcq, srcq
    mov          call_dstq, dstq
    call m(satd_16x8_internal)
%if %1 > 16
    add               srcq, 16
    add               dstq, 16
    dec                 wd
    jge .loopv
    sub               srcq, %1
    sub               dstq, %1
%endif
%if %2 > 8
    lea               srcq, [srcq+src_strideq*8]
    lea               dstq, [dstq+dst_strideq*8]
    dec                 hd
    jge .looph
%endif

    ; Reduce horizontally
    vextracti128       xm0, m10, 1
    paddd              xm0, xm10
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add rounding offset and an offset for how the final butterfly stage and
    ;  the first stage of accumulation was done.
    sub                eax, %1*%2/2 - 2
    shr                eax, 2
    RET
%endmacro

INIT_YMM avx2
SATD_NXM 16, 16
SATD_NXM 32, 32
SATD_NXM 64, 64
SATD_NXM 128, 128

SATD_NXM 16, 32
SATD_NXM 32, 16
SATD_NXM 32, 64
SATD_NXM 64, 32
SATD_NXM 64, 128
SATD_NXM 128, 64

SATD_NXM 32, 8
SATD_NXM 16, 64
SATD_NXM 64, 16

%endif ; ARCH_X86_64