;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_7: times 8 dw 7
convert_to_unsigned_10bit: times 4 dd 0x200
clip_10bit: times 8 dw 0x3ff

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

SECTION .text

; UNPACK_ADD out_lo, out_hi, src1, src2, a|u, a|u
; Loads two rows of bytes (%3 and %4, with aligned/unaligned moves selected
; by %5/%6), zero-extends both to words and adds them pairwise; the low-half
; sums land in %1 and the high-half sums in %2.  Clobbers m4/m5; m7 must be
; zero on entry.
%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro
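; A scalar sketch of the half-pel filter that HPEL_FILTER vectorises below
; (illustrative only, not from the original source).  The taps
; (-1, 3, -7, 21, 21, -7, 3, -1) sum to 32, hence the +16 bias and the >>5;
; packuswb performs the final clip to [0,255].  For the vertical filter,
; src[k] stands for src[k*stride]:
;
;   dst[x] = av_clip_uint8((21*(src[0] + src[1]) + 3*(src[-2] + src[3])
;                            - 7*(src[-1] + src[2]) - (src[-3] + src[4])
;                            + 16) >> 5);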
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov       src0q, srcq
    lea       stridex3q, [3*strideq]
    sub       src0q, stridex3q
    pxor      m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw    m0, [pw_7]
    pmullw    m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw     m0, m2
    paddw     m1, m3
    pmullw    m0, [pw_3]
    pmullw    m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw    m2, [pw_7]
    pmullw    m3, [pw_7]
    psubw     m0, m2
    psubw     m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw     m0, m2
    psubw     m1, m3

    paddw     m0, [pw_16]
    paddw     m1, [pw_16]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    mova      [dstq], m0
    add       dstq, mmsize
    add       srcq, mmsize
    add       src0q, mmsize
    sub       widthd, mmsize
    jg        .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec       widthd
    pxor      m7, m7
    and       widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw    m0, [pw_7]
    pmullw    m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw     m0, m2
    paddw     m1, m3
    pmullw    m0, [pw_3]
    pmullw    m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw    m2, [pw_7]
    pmullw    m3, [pw_7]
    psubw     m0, m2
    psubw     m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw     m0, m2
    psubw     m1, m3

    paddw     m0, [pw_16]
    paddw     m1, [pw_16]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    mova      [dstq + widthq], m0
    sub       widthd, mmsize
    jge       .loop
    RET
%endmacro

%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova      m0, [pb_80]
    add       wd, (mmsize-1)
    and       wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd    dst_strideq, dst_strided
    movsxd    src_strideq, src_strided
    mov       r7d, r5m
    mov       r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov       r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    ; src_stride is in int16_t units, so one src row is src_strideq*2 bytes
    lea       src2q, [srcq+src_strideq*2]
    lea       dst2q, [dstq+dst_strideq]
.loopx:
    sub       wd, mmsize
    mova      m1, [srcq +2*wq]
    mova      m2, [src2q+2*wq]
    packsswb  m1, [srcq +2*wq+mmsize]
    packsswb  m2, [src2q+2*wq+mmsize]
    paddb     m1, m0
    paddb     m2, m0
    mova      [dstq +wq], m1
    mova      [dst2q+wq], m2
    jg        .loopx

    ; advance two rows per iteration
    lea       srcq, [srcq+src_strideq*4]
    lea       dstq, [dstq+dst_strideq*2]
    sub       hd, 2
    mov       wd, wspill
    jg        .loopy
    RET
%endm

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova      m0, [pw_32]
    add       wd, (mmsize-1)
    and       wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd    strideq, strided
    movsxd    idwt_strideq, idwt_strided
    mov       r8d, wd
    %define wspill r8d
%else
    mov       r5m, wd
    %define wspill r5m
%endif

.loop:
    sub       wd, mmsize
    movu      m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw     m1, m0
    psraw     m1, 6
    movu      m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw     m2, m0
    psraw     m2, 6
    paddw     m1, [idwtq+2*wq]
    paddw     m2, [idwtq+2*wq+mmsize]
    packuswb  m1, m2
    mova      [dstq +wq], m1
    jg        .loop

    lea       srcq, [srcq + 2*strideq]
    add       dstq, strideq
    lea       idwtq, [idwtq+ 2*idwt_strideq]
    sub       hd, 1
    mov       wd, wspill
    jg        .loop
    RET
%endm

%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor      m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova      m0, [srcq+i]
    mova      m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova      m2, [obmcq+i]
    mova      m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw    m0, m2
    pmullw    m1, m3
    movu      m2, [dstq+2*i]
    movu      m3, [dstq+2*i+mmsize]
    paddw     m0, m2
    paddw     m1, m3
    movu      [dstq+2*i], m0
    movu      [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea       srcq, [srcq+strideq]
    lea       dstq, [dstq+2*strideq]
    add       obmcq, 32 ; the OBMC weight rows have a fixed stride of 32
    sub       yblend, 1
    jg        .loop
    RET
%endm

INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2

INIT_XMM sse4
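; A scalar sketch of the per-coefficient dequantisation performed by
; dequant_subband_32 below (illustrative only, not from the original source);
; psignd reapplies the sign and forces zero coefficients to stay zero:
;
;   c      = src[x];
;   dst[x] = c ? sign(c) * ((abs(c)*qf + qs) >> 2) : 0;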
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
    movd      m2, qfd
    movd      m3, qsd
    SPLATD    m2
    SPLATD    m3
    mov       r4d, tot_hd
    mov       r3, dstq

    .loop_v:
        mov       tot_hq, r4
        mov       dstq, r3

        .loop_h:
            movu      m0, [srcq]

            pabsd     m1, m0
            pmulld    m1, m2
            paddd     m1, m3
            psrld     m1, 2
            psignd    m1, m0

            movu      [dstq], m1

            add       srcq, mmsize
            add       dstq, mmsize
            sub       tot_hq, 4
        jg .loop_h
        lea       srcq, [srcq + 4*tot_hq]

        add       r3, strideq
        dec       tot_vd
    jg .loop_v

    RET

INIT_XMM sse4
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
%if ARCH_X86_64
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
%else
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
    %define hd r5mp
%endif
    shl       wd, 2
    add       srcq, wq
    neg       wq
    mov       t2q, dstq
    mov       t1q, wq
    pxor      m2, m2
    mova      m3, [clip_10bit]
    mova      m4, [convert_to_unsigned_10bit]

    .loop_h:
        mov       dstq, t2q
        mov       wq, t1q

        .loop_w:
            movu      m0, [srcq+wq+0*mmsize]
            movu      m1, [srcq+wq+1*mmsize]

            paddd     m0, m4
            paddd     m1, m4
            packusdw  m0, m0, m1
            CLIPW     m0, m2, m3 ; packusdw saturates so it's fine

            movu      [dstq], m0

            add       dstq, 1*mmsize
            add       wq, 2*mmsize
        jl .loop_w

        add       srcq, src_strideq
        add       t2q, dst_strideq
        sub       hd, 1
    jg .loop_h

    RET
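; A scalar sketch of put_signed_rect_clamped_10 above (illustrative only, not
; from the original source): each signed 32-bit sample is biased into the
; unsigned 10-bit range and clipped before being packed down to uint16_t:
;
;   dst[x] = av_clip_uintp2(src[x] + 0x200, 10);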