1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%define private_prefix av1
15
16%include "third_party/x86inc/x86inc.asm"
17
18SECTION .text
19
20; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
21;                         int64_t *ssz)
22
INIT_XMM sse2
;-----------------------------------------------------------------------------
; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Computes two sums over block_size coefficients:
;   return value : SSE  = sum((dqcoeff[i] - coeff[i])^2)
;   *ssz         : sum(coeff[i]^2)
;
; Register roles inside the loop:
;   m4 = running 64-bit SSE accumulator (2 qwords)
;   m6 = running 64-bit ssz accumulator (2 qwords)
;   m5 = constant zero, used to widen dwords to qwords via punpck
;   m7 = scratch for the widened halves
;
; NOTE(review): each iteration consumes 2*mmsize bytes = 16 int16 values and
; uses aligned loads (mova), so this assumes block_size is a multiple of 16
; and both buffers are 16-byte aligned — confirm with callers.
;-----------------------------------------------------------------------------
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
  ; Point both pointers one-past-the-end and walk a negative element index up
  ; toward zero; the loop-exit test then falls out of the 'add' flags for free.
  lea     uqcq, [uqcq+sizeq*2]
  lea     dqcq, [dqcq+sizeq*2]
  neg    sizeq
.loop:
  ; Load 8+8 coefficients from each array (index is in int16 units).
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2                 ; m0/m1 = dqcoeff - coeff (per-lane error)
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0                 ; error^2, adjacent pairs summed -> 4 dwords
  pmaddwd   m1, m1
  pmaddwd   m2, m2                 ; coeff^2 for the ssz sum
  pmaddwd   m3, m3
  ; accumulate in 64bit
  ; (zero-extend each dword to a qword by interleaving with m5, then paddq;
  ;  the two widen/add chains for sse (m4) and ssz (m6) are interleaved to
  ;  hide latency — do not reorder)
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m1
  punpckhdq m2, m5
  paddq     m6, m7
  punpckldq m7, m3, m5
  paddq     m6, m2
  punpckhdq m3, m5
  paddq     m6, m7
  paddq     m6, m3
  add    sizeq, mmsize             ; advance 16 coefficients per iteration
  jl .loop                         ; loop while the negative index is below 0

  ; accumulate horizontally and store in return value
  movhlps   m5, m4                 ; fold upper qword onto lower (sse)
  movhlps   m7, m6                 ; fold upper qword onto lower (ssz)
  paddq     m4, m5                 ; m4[63:0] = total sse
  paddq     m6, m7                 ; m6[63:0] = total ssz
%if ARCH_X86_64
  movq    rax, m4                  ; 64-bit ABI: return sse directly in rax
  movq [sszq], m6                  ; *ssz = coefficient energy
%else
  ; 32-bit ABI: 64-bit return value goes in edx:eax; the ssz pointer did not
  ; fit in the 3 available registers, so reload it from its stack home (sszm).
  mov     eax, sszm
  pshufd   m5, m4, 0x1             ; m5[31:0] = high dword of sse
  movq  [eax], m6                  ; *ssz = coefficient energy
  movd    eax, m4                  ; low 32 bits of sse
  movd    edx, m5                  ; high 32 bits of sse
%endif
  RET
80