;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_ef: times 8 db 14,15
pb_67: times 8 db  6, 7
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION_TEXT

%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    m4, maskd
    SPLATW  m4, m4
    add     wd, wd                  ; width in bytes
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push    tmpq
%%.wordloop:                        ; scalar loop for the tail that is not a multiple of 2*mmsize
    sub     wq, 2
%ifidn %2, add
    mov     tmpw, [srcq+wq]
    add     tmpw, [dstq+wq]
%else
    mov     tmpw, [src1q+wq]
    sub     tmpw, [src2q+wq]
%endif
    and     tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop     tmpq
%%.tomainloop:
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz %%.end
%%.loop:                            ; main loop, 2*mmsize bytes per iteration
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1
    p%2w    m2, m3
    pand    m0, m4
    pand    m2, m4
    mov%1   [dstq+wq]       , m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro

; void add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
INIT_MMX mmx
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    INT16_LOOP a, add

INIT_XMM sse2
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test    srcq, mmsize-1
    jnz .unaligned
    test    dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add

; void diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w)
INIT_MMX mmx
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    INT16_LOOP a, sub

INIT_XMM sse2
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    test    src1q, mmsize-1
    jnz .unaligned
    test    src2q, mmsize-1
    jnz .unaligned
    test    dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, sub
.unaligned:
    INT16_LOOP u, sub


%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2                  ; prefix sum within each pair of words
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2                  ; prefix sum within each group of 4 words
    pshufb  m0, m5                  ; broadcast the running left value from the previous iteration
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2                  ; prefix sum across all 8 words
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0                 ; return the last output pixel (the new left value)
    RET
%endmacro

; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48                  ; initial left value in the top word
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM sse4
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14                  ; initial left value in the top word
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add     wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov     [leftq], r2d
    movzx   r2d, word [topq-2]
    mov     [left_topq], r2d
    RET

; void sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top)
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
    add     wd, wd
    movd    mm7, maskd
    SPLATW  mm7, mm7
    movq    mm0, [src1q]
    movq    mm2, [src2q]
    psllq   mm0, 16
    psllq   mm2, 16
    movd    mm6, [left_topq]
    por     mm0, mm6
    movd    mm6, [leftq]
    por     mm2, mm6
    xor     maskq, maskq
.loop:
    movq    mm1, [src1q + maskq]
    movq    mm3, [src2q + maskq]
    movq    mm4, mm2
    psubw   mm2, mm0
    paddw   mm2, mm1 ; l+t-tl
    pand    mm2, mm7
    movq    mm5, mm4
    pmaxsw  mm4, mm1
    pminsw  mm1, mm5
    pminsw  mm4, mm2
    pmaxsw  mm4, mm1 ; median(l, t, l+t-tl)
    psubw   mm3, mm4 ; src2 - pred
    pand    mm3, mm7
    movq    [dstq + maskq], mm3
    add     maskq, 8
    movq    mm0, [src1q + maskq - 2]
    movq    mm2, [src2q + maskq - 2]
    cmp     maskq, wq
    jb .loop
    movzx   maskd, word [src1q + wq - 2]
    mov     [left_topq], maskd
    movzx   maskd, word [src2q + wq - 2]
    mov     [leftq], maskd
    RET