;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
;                               const uint8_t *src2, ptrdiff_t stride2);
;
; Plain MMX has no PSADBW, so the sum of absolute differences is built by
; hand: |a-b| is computed as (a -us b) | (b -us a) using unsigned saturating
; subtraction, then the byte diffs are widened to words and summed.
; Worst case total is 64 bytes * 255 = 16320, which fits in 16 bits, so all
; PADDW accumulations below are overflow-free.
;-------------------------------------------------------------------------------
INIT_MMX mmx
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor            m7, m7                      ; m7 = 0, used to unpack bytes to words
    pxor            m6, m6                      ; m6 = running accumulator (4 word sums)
%rep 4                                          ; 4 iterations x 2 rows each = 8 rows
    mova            m0, [src1q]                 ; row 0 of src1
    mova            m2, [src1q + stride1q]      ; row 1 of src1
    mova            m1, [src2q]                 ; row 0 of src2
    mova            m3, [src2q + stride2q]      ; row 1 of src2
    ; 3-operand forms on MMX are emulated by x86inc (mova + op)
    psubusb         m4, m0, m1                  ; saturating diffs in both directions;
    psubusb         m5, m2, m3                  ; exactly one of each pair is nonzero,
    psubusb         m1, m0                      ; so OR-ing them gives |src1 - src2|
    psubusb         m3, m2
    por             m1, m4                      ; m1 = per-byte |diff|, row 0
    por             m3, m5                      ; m3 = per-byte |diff|, row 1
    punpcklbw       m0, m1, m7                  ; widen low 4 bytes to words
    punpcklbw       m2, m3, m7
    punpckhbw       m1, m7                      ; widen high 4 bytes to words
    punpckhbw       m3, m7
    paddw           m0, m1                      ; sum the word lanes of both rows
    paddw           m2, m3
    paddw           m0, m2
    paddw           m6, m0                      ; accumulate into m6
    lea             src1q, [src1q + 2*stride1q] ; advance both sources by two rows
    lea             src2q, [src2q + 2*stride2q]
%endrep
    ; horizontal reduction: fold the 4 word sums down into the low word
    psrlq           m0, m6, 32
    paddw           m6, m0
    psrlq           m0, m6, 16
    paddw           m6, m0
    movd            eax, m6
    movzx           eax, ax                     ; keep only the low 16-bit result
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; MMXEXT provides PSADBW, which computes the whole per-row byte SAD in one
; instruction (result in the low word of the mm register).
;-------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor            m2, m2                      ; m2 = accumulator
%rep 4                                          ; 4 iterations x 2 rows each = 8 rows
    mova            m0, [src1q]
    mova            m1, [src1q + stride1q]
    psadbw          m0, [src2q]                 ; SAD of row 0
    psadbw          m1, [src2q + stride2q]      ; SAD of row 1
    paddw           m2, m0                      ; max total 8*8*255 = 16320, fits a word
    paddw           m2, m1
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
%endrep
    movd            eax, m2
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                    const uint8_t *src2, ptrdiff_t stride2);
;
; Each 16-byte row is handled as two 8-byte PSADBW halves.
; Max total is 16*16*255 = 65280, which still fits in 16 bits, so PADDW
; accumulation remains safe.
;-------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
    pxor            m2, m2                      ; m2 = accumulator
%rep 16                                         ; one iteration per row
    mova            m0, [src1q]                 ; left 8 bytes of the row
    mova            m1, [src1q + 8]             ; right 8 bytes of the row
    psadbw          m0, [src2q]
    psadbw          m1, [src2q + 8]
    paddw           m2, m0
    paddw           m2, m1
    add             src1q, stride1q
    add             src2q, stride2q
%endrep
    movd            eax, m2
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Unaligned-load variant: both sources are read with MOVU.  XMM PSADBW
; produces two partial sums (one per 64-bit half); they are combined at the
; end with MOVHLPS + PADDW.
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    ; first two rows are peeled so m4 can serve as the accumulator
    movu            m4, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m4, m2
    psadbw          m1, m3
    paddw           m4, m1
%rep 7                                          ; 7 iterations x 2 rows = remaining 14 rows
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    movu            m0, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m0, m2
    psadbw          m1, m3
    paddw           m4, m0
    paddw           m4, m1
%endrep
    movhlps         m0, m4                      ; bring the high qword partial sum down
    paddw           m4, m0                      ; and add it to the low one
    movd            eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 is 'a' (aligned, mova) or 'u' (unaligned, movu).  Loads from src2 go
; through mov%1, while src1 is used directly as a PSADBW memory operand
; (which requires the declared alignment in the 'a' variant).
;-------------------------------------------------------------------------------
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    ; first two rows peeled; m2 is the accumulator
    mov%1           m2, [src2q]
    psadbw          m2, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m1
%rep 7                                          ; remaining 14 rows, two per iteration
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    mov%1           m0, [src2q]
    psadbw          m0, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m0
    paddw           m2, m1
%endrep
    movhlps         m0, m2                      ; fold high qword partial sum into low
    paddw           m2, m0
    movd            eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u

; Process 4 rows of a 32-byte-wide block, unaligned loads on both sources.
; Raw register names are used here: with the cglobal argument order below,
; r0 = src1, r1 = stride1, r2 = src2, r3 = stride2.  m0 is the caller's
; accumulator; 32x32 sums can exceed 16 bits, hence PADDD.
%macro PROCESS_SAD_32x4_U 0
    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro

; Same as PROCESS_SAD_32x4_U but parameterized on the load type:
; %1 is 'a' (mova) or 'u' (movu).  src1 (r0) is used directly as a PSADBW
; memory operand, so the 'a' variant requires 16-byte-aligned sources.
%macro PROCESS_SAD_32x4 1
    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro

;-----------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor            m0, m0                      ; m0 = dword accumulator
    mov             r4d, 4                      ; 4 loop iterations x 8 rows = 32 rows
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec             r4d
    jnz             .loop

    movhlps         m1, m0                      ; fold high qword partial sum into low
    paddd           m0, m1
    movd            eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 selects the load type, see PROCESS_SAD_32x4.
;-------------------------------------------------------------------------------
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor            m0, m0                      ; m0 = dword accumulator
    mov             r4d, 4                      ; 4 iterations x 8 rows = 32 rows
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec             r4d
    jnz             .loop

    movhlps         m1, m0                      ; fold high qword partial sum into low
    paddd           m0, m1
    movd            eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Whole 32-byte rows in single YMM loads, 4 rows per loop iteration.
; r5/r6 hold 3*stride so rows 0..3 can all be addressed from the unmodified
; base pointers.
;-------------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0                      ; m0 = dword accumulator
    mov             r4d, 32/4                   ; 8 iterations x 4 rows = 32 rows
    lea             r5, [stride1q * 3]          ; r5 = 3 * stride1 (row 3 offset)
    lea             r6, [stride2q * 3]          ; r6 = 3 * stride2 (row 3 offset)

.loop:
    movu            m1, [src1q]                 ; row 0 of pix0
    movu            m2, [src2q]                 ; row 0 of pix1
    movu            m3, [src1q + stride1q]      ; row 1 of pix0
    movu            m4, [src2q + stride2q]      ; row 1 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    movu            m1, [src1q + 2 * stride1q]  ; row 2 of pix0
    movu            m2, [src2q + 2 * stride2q]  ; row 2 of pix1
    movu            m3, [src1q + r5]            ; row 3 of pix0
    movu            m4, [src2q + r6]            ; row 3 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    ; horizontal reduction of the 4 qword partial sums
    vextracti128    xm1, m0, 1                  ; high lane -> xm1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2                 ; high qword -> low qword
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 is 'a' (aligned, mova) or 'u' (unaligned, movu); src1 is used directly
; as a PSADBW memory operand, so the 'a' variant requires 32-byte alignment.
;-------------------------------------------------------------------------------
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor            m0, m0                      ; m0 = dword accumulator
    mov             r4d, 32/4                   ; 8 iterations x 4 rows = 32 rows
    lea             r5, [stride1q * 3]          ; r5 = 3 * stride1 (row 3 offset)
    lea             r6, [stride2q * 3]          ; r6 = 3 * stride2 (row 3 offset)

.loop:
    mov%1           m1, [src2q]                 ; row 0 of pix1
    psadbw          m1, [src1q]
    mov%1           m2, [src2q + stride2q]      ; row 1 of pix1
    psadbw          m2, [src1q + stride1q]

    paddd           m0, m1
    paddd           m0, m2

    mov%1           m1, [src2q + 2 * stride2q]  ; row 2 of pix1
    psadbw          m1, [src1q + 2 * stride1q]
    mov%1           m2, [src2q + r6]            ; row 3 of pix1
    psadbw          m2, [src1q + r5]

    paddd           m0, m1
    paddd           m0, m2

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    ; horizontal reduction of the 4 qword partial sums
    vextracti128    xm1, m0, 1                  ; high lane -> xm1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2                 ; high qword -> low qword
    paddd           xm0, xm1
    movd            eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif