;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text

;------------------------------------------------------------------------------
; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                const uint8_t *diff, int w,
;                                int *left, int *left_top)
;------------------------------------------------------------------------------
%macro MEDIAN_PRED 0
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7
    add     wq, mmsize
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
MEDIAN_PRED
%endif
INIT_XMM sse2
MEDIAN_PRED

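;------------------------------------------------------------------------------
; Left prediction reconstructs each byte as a running sum of residuals.  As a
; rough scalar sketch (not the exact C reference in lossless_videodsp.c):
;
;     for (i = 0; i < w; i++) {
;         left  += src[i];      // wraps mod 256, dst is uint8_t
;         dst[i] = left;
;     }
;     return dst[w - 1];        // last reconstructed byte becomes the new left
;
; Every output depends on the previous one, so ADD_LEFT_LOOP below resolves the
; dependency with an in-register prefix sum: one psllw step plus pshufb
; shift/add steps (masks pb_zz11zz55zz99zzdd, pb_zzzz3333zzzzbbbb,
; pb_zzzzzzzz77777777), then adds the carried "left" value broadcast to every
; byte via pb_7/pb_15.
;------------------------------------------------------------------------------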
%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    pshufb  xm0, xm5
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    psllw   m2, m1, 8
    paddb   m1, m2
    pshufb  m2, m1, m3
    paddb   m1, m2
    pshufb  m2, m1, m4
    paddb   m1, m2
%if mmsize >= 16
    pshufb  m2, m1, m6
    paddb   m1, m2
%endif
    paddb   xm0, xm1
%if %1
    mova    [dstq+wq], xm0
%else
    movq    [dstq+wq], xm0
    movhps  [dstq+wq+8], xm0
%endif

%if mmsize == 32
    vextracti128 xm2, m1, 1 ; get second lane of the ymm
    pshufb  xm0, xm5        ; set all vals to the last val of the first lane
    paddb   xm0, xm2
; store val
%if %1
    mova    [dstq+wq+16], xm0
%else
    movq    [dstq+wq+16], xm0
    movhps  [dstq+wq+16+8], xm0
%endif
%endif
    add     wq, mmsize
    jl %%.loop
%if mmsize == 32
    movzx   eax, byte [dstq - 1]
%else
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
%endif
    RET
%endmacro

;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_LEFT_LOOP 1, 1

%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova    xm5, [pb_15]
    VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
    VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
    VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
    movd    xm0, leftm
    pslldq  xm0, 15
    test    srcq, mmsize - 1
    jnz .src_unaligned
    test    dstq, mmsize - 1
    jnz .dst_unaligned
    ADD_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_LEFT_LOOP 0, 0
%endmacro

INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif

;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize
    jz .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl .1
.2:
    and     wq, 2*mmsize-1
    jz .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif

%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

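;---------------------------------------------------------------------------------------------
; add_left_pred_int16 applies the same running-sum reconstruction to 16-bit samples, clamped
; to the coded bit depth through the mask operand (assumed to be (1 << depth) - 1, so masking
; each sample is equivalent to masking once at the end).  A rough scalar sketch, not the exact
; C reference:
;
;     for (i = 0; i < w; i++) {
;         left   = (left + src[i]) & mask;
;         dst[i] = left;
;     }
;     return left;
;---------------------------------------------------------------------------------------------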
;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u


;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
    mova    xm0, [pb_15]

; load src[-1] into xm1
    movd    xm1, [srcq-1]
%if cpuflag(avx2)
    vpbroadcastb xm1, xm1
%else
    pxor    xm2, xm2
    pshufb  xm1, xm2
%endif

    add     srcq, widthq
    neg     widthq
    neg     strideq

.loop:
    lea     tmpq, [srcq + strideq]
    mova    m2, [tmpq + widthq]     ; A = src[x - stride]
    movu    m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
    mova    m4, [srcq + widthq]     ; current val (src[x])

    psubb   m2, m3 ; A - B

; prefix sum A-B
    pslldq  m3, m2, 1
    paddb   m2, m3
    pslldq  m3, m2, 2
    paddb   m2, m3
    pslldq  m3, m2, 4
    paddb   m2, m3
    pslldq  m3, m2, 8
    paddb   m2, m3

; prefix sum current val
    pslldq  m3, m4, 1
    paddb   m4, m3
    pslldq  m3, m4, 2
    paddb   m4, m3
    pslldq  m3, m4, 4
    paddb   m4, m3
    pslldq  m3, m4, 8
    paddb   m4, m3

; last sum
    paddb   m2, m4 ; current + (A - B)

    paddb   xm1, xm2 ; += C
    mova    [srcq + widthq], xm1 ; store

    pshufb  xm1, xm0 ; put last val in all vals of xm1

%if mmsize == 32
    vextracti128 xm2, m2, 1 ; get second lane of the ymm
    paddb   xm1, xm2 ; += C

    mova    [srcq + widthq + 16], xm1 ; store
    pshufb  xm1, xm0 ; put last val in all vals of xm1
%endif

    add     widthq, mmsize
    jl .loop
    RET

%endmacro

INIT_XMM ssse3
ADD_GRADIENT_PRED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif
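
;---------------------------------------------------------------------------------------------
; For reference, ADD_GRADIENT_PRED above reconstructs in place with the gradient
; (left + top - topleft) predictor; a rough scalar sketch, not the exact C reference:
;
;     for (x = 0; x < width; x++) {
;         int A = src[x - stride];       // top, already reconstructed
;         int B = src[x - stride - 1];   // top-left
;         int C = src[x - 1];            // left, reconstructed on the previous step
;         src[x] = (src[x] + A - B + C) & 0xFF;
;     }
;
; The serial dependency on C is what the two in-register prefix sums in the loop remove.
;---------------------------------------------------------------------------------------------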