;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pd_64: times 8 dd 64

cextern pw_1023
cextern pw_4095

SECTION .text

%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movu        m1, [srcq+4]
    movu        m3, [srcq+6]
    paddd       m0, m2
    movu        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
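    ; Each iteration produces one 4-pixel output row: the 8 source rows
    ; (3 above through 4 below the current position, split between srcq
    ; and src4q) are interleaved pairwise and multiplied by the four
    ; coefficient pairs at filteryq+0/32/64/96, summed, rounded with
    ; (x + 64) >> 7, packed to words and clamped to the 10/12-bit maximum
    ; in m5 (and to 0 in the non-SSE4 path, where packssdw keeps the sign).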
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m12, m12
%endif
%if ARCH_X86_64
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
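    ; Same scheme as filter_v4_fn, but for a full register width of pixels
    ; (mmsize/2 per row): SBUTTERFLY splits each interleaved row pair into
    ; low/high word halves, which are accumulated separately in m0/m1
    ; before rounding, packing and clamping.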
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif