;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%define private_prefix av1

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Add the four 32-bit lanes of %2 into the two 64-bit lanes of %1.
; Each half of %2 is interleaved with the all-zero register %4, which widens
; its dwords to (zero-extended) qwords before the 64-bit add.
; %3 is a scratch register; %2 is destroyed.
%macro ACCUM_DWORDS_AS_QWORDS 4 ; acc, src, tmp, zero
  punpckldq   %3, %2, %4
  punpckhdq   %2, %4
  paddq       %1, %3
  paddq       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum of (dqcoeff[i] - coeff[i])^2 over the block and stores the
; sum of coeff[i]^2 to *ssz.
;
; In:   coeff, dqcoeff = int16 arrays, read with aligned vector loads (mova)
;       count          = block_size in elements; two vectors are consumed per
;                        iteration and the loop body always runs at least once
;       ssz            = 4th argument; not loaded by the prologue. It is read
;                        from its argument register on x86-64 and from its
;                        stack slot (sszm) on x86-32
; Out:  x86-64: rax = error sum;  x86-32: edx:eax = error sum
;       [ssz]   = sum of squared coeff values (64 bit)
; Regs: m0/m1 = differences, then their pairwise squared sums
;       m2/m3 = coeff values, then their pairwise squared sums
;       m4 = error accumulator, m6 = ssz accumulator (2 x 64 bit each)
;       m5 = zero, m7 = scratch
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal block_error, 3, 3, 8, coeff, dqcoeff, count, ssz
  pxor        m5, m5                      ; constant zero for the dword->qword widening
  pxor        m4, m4                      ; error (sse) accumulator
  pxor        m6, m6                      ; ssz accumulator

  ; Point both arrays at their end and count a negative index up towards 0,
  ; so the flags of the single add below also terminate the loop.
  lea         dqcoeffq, [dqcoeffq+countq*2]
  lea         coeffq, [coeffq+countq*2]
  neg         countq

.loop:
  mova        m0, [dqcoeffq+countq*2]
  mova        m1, [dqcoeffq+countq*2+mmsize]
  mova        m2, [coeffq+countq*2]
  mova        m3, [coeffq+countq*2+mmsize]

  ; m0/m1 = dqcoeff - coeff. This has to happen before m2/m3 are squared
  ; in place below.
  psubw       m0, m2
  psubw       m1, m3

  ; Square and add adjacent pairs. Individual errors are at most
  ; 15 bit + sign, so each square is 30 bit and the sum of two squares still
  ; fits in a 31 bit integer (leaving the sign bit unused).
  pmaddwd     m2, m2
  pmaddwd     m3, m3
  pmaddwd     m0, m0
  pmaddwd     m1, m1

  ; Accumulate in 64 bit.
  ACCUM_DWORDS_AS_QWORDS m4, m0, m7, m5
  ACCUM_DWORDS_AS_QWORDS m4, m1, m7, m5
  ACCUM_DWORDS_AS_QWORDS m6, m2, m7, m5
  ACCUM_DWORDS_AS_QWORDS m6, m3, m7, m5

  add         countq, mmsize              ; index advances by mmsize elements (2 vectors)
  jl .loop

  ; Fold the upper 64-bit lane of each accumulator onto the lower one.
  movhlps     m5, m4
  paddq       m4, m5
  movhlps     m7, m6
  paddq       m6, m7

%if ARCH_X86_64
  movq        [sszq], m6                  ; *ssz = sum of squared coeffs
  movq        rax, m4                     ; return value = error sum
%else
  pshufd      m5, m4, 0x1                 ; bits 63:32 of the error sum -> lane 0
  movd        edx, m5                     ; high half of the 64-bit return value
  mov         eax, sszm                   ; ssz pointer lives on the stack
  movq        [eax], m6                   ; *ssz = sum of squared coeffs
  movd        eax, m4                     ; low half of the 64-bit return value
%endif
  RET