;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION .text

%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_avx2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
  mov           n_rowsd, %1
  pxor          m0, m0

.loop:
  movu          m1, [refq]
  movu          m2, [refq+32]
  movu          m3, [refq+64]
  movu          m4, [refq+96]
%if %2 == 1
  vpavgb        m1, [second_predq+mmsize*0]
  vpavgb        m2, [second_predq+mmsize*1]
  vpavgb        m3, [second_predq+mmsize*2]
  vpavgb        m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  vpsadbw       m1, [srcq]
  vpsadbw       m2, [srcq+32]
  vpsadbw       m3, [srcq+64]
  vpsadbw       m4, [srcq+96]

  add           refq, ref_strideq
  add           srcq, src_strideq

  vpaddd        m1, m2
  vpaddd        m3, m4
  vpaddd        m0, m1
  vpaddd        m0, m3

  dec           n_rowsd
  jg .loop

  vextracti128  xm1, m0, 1
  paddd         xm0, xm1

  movhlps       xm1, xm0
  paddd         xm0, xm1
  movd          eax, xm0

  RET
%endmacro

INIT_YMM avx2
SAD128XN 128     ; sad128x128_avx2
SAD128XN 128, 1  ; sad128x128_avg_avx2
SAD128XN 64      ; sad128x64_avx2
SAD128XN 64, 1   ; sad128x64_avg_avx2


; unsigned int aom_sad64x64_avx2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov           n_rowsd, %1/2
  pxor          m0, m0
.loop:
  movu          m1, [refq]
  movu          m2, [refq+32]
  movu          m3, [refq+ref_strideq]
  movu          m4, [refq+ref_strideq+32]
%if %2 == 1
  vpavgb        m1, [second_predq+mmsize*0]
  vpavgb        m2, [second_predq+mmsize*1]
  vpavgb        m3, [second_predq+mmsize*2]
  vpavgb        m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  vpsadbw       m1, [srcq]
  vpsadbw       m2, [srcq+32]
  vpsadbw       m3, [srcq+src_strideq]
  vpsadbw       m4, [srcq+src_strideq+32]

  vpaddd        m1, m2
  vpaddd        m3, m4
  lea           refq, [refq+ref_strideq*2]
  vpaddd        m0, m1
  lea           srcq, [srcq+src_strideq*2]
  vpaddd        m0, m3
  dec           n_rowsd
  jg .loop

  vextracti128  xm1, m0, 1
  paddd         xm0, xm1

  movhlps       xm1, xm0
  paddd         xm0, xm1
  movd          eax, xm0
  RET
%endmacro

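; Each macro in this file accumulates per-row SADs into m0 and reduces the
; accumulator after the loop: SAD128XN above handles one 128-byte row per
; iteration (n_rows = height), SAD64XN above handles two 64-byte rows per
; iteration (n_rows = height/2), and SAD32XN below handles four 32-byte rows
; per iteration (n_rows = height/4), which is why it calls SAD_FN with
; %3 == 7 so that the src_stride3/ref_stride3 registers are set up.
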
INIT_YMM avx2
SAD64XN 128     ; sad64x128_avx2
SAD64XN 128, 1  ; sad64x128_avg_avx2
SAD64XN 64      ; sad64x64_avx2
SAD64XN 32      ; sad64x32_avx2
SAD64XN 64, 1   ; sad64x64_avg_avx2
SAD64XN 32, 1   ; sad64x32_avg_avx2
SAD64XN 16      ; sad64x16_avx2
SAD64XN 16, 1   ; sad64x16_avg_avx2


; unsigned int aom_sad32x32_avx2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0
.loop:
  movu          m1, [refq]
  movu          m2, [refq+ref_strideq]
  movu          m3, [refq+ref_strideq*2]
  movu          m4, [refq+ref_stride3q]
%if %2 == 1
  vpavgb        m1, [second_predq+mmsize*0]
  vpavgb        m2, [second_predq+mmsize*1]
  vpavgb        m3, [second_predq+mmsize*2]
  vpavgb        m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+src_strideq]
  psadbw        m3, [srcq+src_strideq*2]
  psadbw        m4, [srcq+src_stride3q]

  vpaddd        m1, m2
  vpaddd        m3, m4
  lea           refq, [refq+ref_strideq*4]
  vpaddd        m0, m1
  lea           srcq, [srcq+src_strideq*4]
  vpaddd        m0, m3
  dec           n_rowsd
  jg .loop

  vextracti128  xm1, m0, 1
  paddd         xm0, xm1

  movhlps       xm1, xm0
  paddd         xm0, xm1
  movd          eax, xm0
  RET
%endmacro

INIT_YMM avx2
SAD32XN 64     ; sad32x64_avx2
SAD32XN 32     ; sad32x32_avx2
SAD32XN 16     ; sad32x16_avx2
SAD32XN 64, 1  ; sad32x64_avg_avx2
SAD32XN 32, 1  ; sad32x32_avg_avx2
SAD32XN 16, 1  ; sad32x16_avg_avx2
SAD32XN 8      ; sad32x8_avx2
SAD32XN 8, 1   ; sad32x8_avg_avx2