1;****************************************************************************** 2;* Copyright (c) 2010 David Conrad 3;* 4;* This file is part of FFmpeg. 5;* 6;* FFmpeg is free software; you can redistribute it and/or 7;* modify it under the terms of the GNU Lesser General Public 8;* License as published by the Free Software Foundation; either 9;* version 2.1 of the License, or (at your option) any later version. 10;* 11;* FFmpeg is distributed in the hope that it will be useful, 12;* but WITHOUT ANY WARRANTY; without even the implied warranty of 13;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14;* Lesser General Public License for more details. 15;* 16;* You should have received a copy of the GNU Lesser General Public 17;* License along with FFmpeg; if not, write to the Free Software 18;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19;****************************************************************************** 20 21%include "libavutil/x86/x86util.asm" 22 23SECTION_RODATA 24pw_7: times 8 dw 7 25 26cextern pw_3 27cextern pw_16 28cextern pw_32 29cextern pb_80 30 31section .text 32 33%macro UNPACK_ADD 6 34 mov%5 %1, %3 35 mov%6 m5, %4 36 mova m4, %1 37 mova %2, m5 38 punpcklbw %1, m7 39 punpcklbw m5, m7 40 punpckhbw m4, m7 41 punpckhbw %2, m7 42 paddw %1, m5 43 paddw %2, m4 44%endmacro 45 46%macro HPEL_FILTER 1 47; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); 48cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 49 mov src0q, srcq 50 lea stridex3q, [3*strideq] 51 sub src0q, stridex3q 52 pxor m7, m7 53.loop: 54 ; 7*(src[0] + src[1]) 55 UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a 56 pmullw m0, [pw_7] 57 pmullw m1, [pw_7] 58 59 ; 3*( ... + src[-2] + src[3]) 60 UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a 61 paddw m0, m2 62 paddw m1, m3 63 pmullw m0, [pw_3] 64 pmullw m1, [pw_3] 65 66 ; ... - 7*(src[-1] + src[2]) 67 UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a 68 pmullw m2, [pw_7] 69 pmullw m3, [pw_7] 70 psubw m0, m2 71 psubw m1, m3 72 73 ; ... - (src[-3] + src[4]) 74 UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a 75 psubw m0, m2 76 psubw m1, m3 77 78 paddw m0, [pw_16] 79 paddw m1, [pw_16] 80 psraw m0, 5 81 psraw m1, 5 82 packuswb m0, m1 83 mova [dstq], m0 84 add dstq, mmsize 85 add srcq, mmsize 86 add src0q, mmsize 87 sub widthd, mmsize 88 jg .loop 89 RET 90 91; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); 92cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width 93 dec widthd 94 pxor m7, m7 95 and widthd, ~(mmsize-1) 96.loop: 97 ; 7*(src[0] + src[1]) 98 UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u 99 pmullw m0, [pw_7] 100 pmullw m1, [pw_7] 101 102 ; 3*( ... + src[-2] + src[3]) 103 UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u 104 paddw m0, m2 105 paddw m1, m3 106 pmullw m0, [pw_3] 107 pmullw m1, [pw_3] 108 109 ; ... - 7*(src[-1] + src[2]) 110 UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u 111 pmullw m2, [pw_7] 112 pmullw m3, [pw_7] 113 psubw m0, m2 114 psubw m1, m3 115 116 ; ... 
- (src[-3] + src[4]) 117 UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u 118 psubw m0, m2 119 psubw m1, m3 120 121 paddw m0, [pw_16] 122 paddw m1, [pw_16] 123 psraw m0, 5 124 psraw m1, 5 125 packuswb m0, m1 126 mova [dstq + widthq], m0 127 sub widthd, mmsize 128 jge .loop 129 RET 130%endmacro 131 132%macro PUT_RECT 1 133; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) 134cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 135 mova m0, [pb_80] 136 add wd, (mmsize-1) 137 and wd, ~(mmsize-1) 138 139%if ARCH_X86_64 140 movsxd dst_strideq, dst_strided 141 movsxd src_strideq, src_strided 142 mov r7d, r5m 143 mov r8d, wd 144 %define wspill r8d 145 %define hd r7d 146%else 147 mov r4m, wd 148 %define wspill r4m 149 %define hd r5mp 150%endif 151 152.loopy 153 lea src2q, [srcq+src_strideq*2] 154 lea dst2q, [dstq+dst_strideq] 155.loopx: 156 sub wd, mmsize 157 mova m1, [srcq +2*wq] 158 mova m2, [src2q+2*wq] 159 packsswb m1, [srcq +2*wq+mmsize] 160 packsswb m2, [src2q+2*wq+mmsize] 161 paddb m1, m0 162 paddb m2, m0 163 mova [dstq +wq], m1 164 mova [dst2q+wq], m2 165 jg .loopx 166 167 lea srcq, [srcq+src_strideq*4] 168 lea dstq, [dstq+dst_strideq*2] 169 sub hd, 2 170 mov wd, wspill 171 jg .loopy 172 RET 173%endm 174 175%macro ADD_RECT 1 176; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) 177cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h 178 mova m0, [pw_32] 179 add wd, (mmsize-1) 180 and wd, ~(mmsize-1) 181 182%if ARCH_X86_64 183 movsxd strideq, strided 184 movsxd idwt_strideq, idwt_strided 185 mov r8d, wd 186 %define wspill r8d 187%else 188 mov r5m, wd 189 %define wspill r5m 190%endif 191 192.loop: 193 sub wd, mmsize 194 movu m1, [srcq +2*wq] ; FIXME: ensure alignment 195 paddw m1, m0 196 psraw m1, 6 197 movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment 198 paddw m2, m0 199 psraw m2, 6 200 paddw m1, [idwtq+2*wq] 201 paddw m2, [idwtq+2*wq+mmsize] 202 packuswb m1, m2 203 mova [dstq +wq], m1 204 jg .loop 205 206 lea srcq, [srcq + 2*strideq] 207 add dstq, strideq 208 lea idwtq, [idwtq+ 2*idwt_strideq] 209 sub hd, 1 210 mov wd, wspill 211 jg .loop 212 RET 213%endm 214 215%macro ADD_OBMC 2 216; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) 217cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen 218 pxor m4, m4 219.loop: 220%assign i 0 221%rep %1 / mmsize 222 mova m0, [srcq+i] 223 mova m1, m0 224 punpcklbw m0, m4 225 punpckhbw m1, m4 226 mova m2, [obmcq+i] 227 mova m3, m2 228 punpcklbw m2, m4 229 punpckhbw m3, m4 230 pmullw m0, m2 231 pmullw m1, m3 232 movu m2, [dstq+2*i] 233 movu m3, [dstq+2*i+mmsize] 234 paddw m0, m2 235 paddw m1, m3 236 movu [dstq+2*i], m0 237 movu [dstq+2*i+mmsize], m1 238%assign i i+mmsize 239%endrep 240 lea srcq, [srcq+strideq] 241 lea dstq, [dstq+2*strideq] 242 add obmcq, 32 243 sub yblend, 1 244 jg .loop 245 RET 246%endm 247 248INIT_MMX 249%if ARCH_X86_64 == 0 250PUT_RECT mmx 251ADD_RECT mmx 252 253HPEL_FILTER mmx 254ADD_OBMC 32, mmx 255ADD_OBMC 16, mmx 256%endif 257ADD_OBMC 8, mmx 258 259INIT_XMM 260PUT_RECT sse2 261ADD_RECT sse2 262 263HPEL_FILTER sse2 264ADD_OBMC 32, sse2 265ADD_OBMC 16, sse2 266
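
; For reference, a scalar C model of the half-pel filter implemented by
; HPEL_FILTER above; this is an illustrative sketch, not FFmpeg's actual C
; fallback, and the function name is hypothetical. The taps work out to
; (-1, 3, -7, 21, 21, -7, 3, -1)/32 with round-to-nearest; packuswb supplies
; the final unsigned 8-bit clamp, modeled here with av_clip_uint8() from
; libavutil/common.h.
;
;   static void hpel_filter_v_ref(uint8_t *dst, const uint8_t *src,
;                                 int stride, int width)
;   {
;       for (int x = 0; x < width; x++) {
;           int v = 21*(src[x]          + src[x+  stride])
;                 -  7*(src[x-  stride] + src[x+2*stride])
;                 +  3*(src[x-2*stride] + src[x+3*stride])
;                 -    (src[x-3*stride] + src[x+4*stride]);
;           dst[x] = av_clip_uint8((v + 16) >> 5); // +16 rounds before >>5
;       }
;   }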