;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Row-pass multipliers for the RV34 4x4 integer IDCT, broadcast across
; 4 words each so a whole row can be scaled with one pmullw.
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
; Rounder added before the final >>10 of the column pass (2 dwords for mm).
pd_512:         times 2 dd 0x200
; Column-pass multipliers, laid out for pmaddwd on shuffled column pairs:
; each dword lane computes a sum/difference of two scaled coefficients.
pw_col_coeffs:  dw  13,  13,  13, -13
                dw  17,   7,   7, -17
                dw  13, -13,  13,  13
                dw  -7,  17, -17,  -7

SECTION .text

; DC-only IDCT without rounding: %1 = (%1 * 13*13*3) >> 11.
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; DC-only IDCT with rounding: %1 = (%1 * 13*13 + 0x200) >> 10.
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; Emit rv34_idct_%1: read the DC coefficient (word at block[0]), transform it
; with whichever macro IDCT_DC is %defined to, and splat the result over all
; 16 words of the 4x4 block.
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
    movsx   r1, word [r0]   ; sign-extended DC coefficient
    IDCT_DC r1
    movd    m0, r1d
    pshufw  m0, m0, 0       ; broadcast low word to all 4 lanes
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround

; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
; x86-32 only; 64-bit builds use the SSE2/SSE4 version below.
%if ARCH_X86_32
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
    ; calculate DC
    ; MMX has no paddsb on unsigned bytes with a signed word delta, so the
    ; signed DC is split into a positive part (m0) and a negative part (m1),
    ; applied below as saturating byte add then saturating byte subtract.
    IDCT_DC_ROUND r2
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0        ; m1 = -DC (clamped to 0 by packuswb if DC >= 0)
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0        ; broadcast saturated +DC to all 8 bytes
    punpcklwd  m1, m1        ; broadcast saturated -DC to all 8 bytes

    ; add DC
    lea        r2, [r0+r1*2] ; r2 = dst + 2*stride (dc no longer needed)
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET
%endif

; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
; %1 = base address of the 4x4 int16 block; the block memory is cleared
; (set to zero via mm7) as a side effect after loading.
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova        [%1+ 0*8], mm7
    mova        [%1+ 1*8], mm7
    mova        [%1+ 2*8], mm7
    mova        [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
; Column transform for one row-pair of 4 destination pixels:
; %1 = dst memory operand, %2 = register holding a row-transformed row,
; %3/%4 = pmaddwd coefficient operands (memory or register).
; Clobbers mm1-mm3; expects the 0x200 rounder in mm5 (set by ROW_TRANSFORM).
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD        ; col. 1,3,1,3
    pshufw      %2,  %2, 0x88        ; col. 0,2,0,2
    pmaddwd     %2,  %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd       %2,  mm5             ; add rounder before the >>10 below
    pshufw      mm1, %2, 01001110b   ;    z1 | z0
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    paddd       %2,  mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    movd        mm3, %1              ; load 4 dst pixels
    psrad       %2,  10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2             ; widen dst bytes to words
    packssdw    %2,  mm1             ; 4 residuals as signed words
    paddw       %2, mm3              ; dst + residual
    packuswb    %2, %2               ; clamp to [0,255]
    movd        %1, %2               ; store 4 pixels back
%endmacro
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq
    COL_TRANSFORM       [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova                mm0, [pw_col_coeffs+ 0] ; cache coeffs for later rows
    COL_TRANSFORM       [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova                mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM       [dq], mm6, mm0, mm4
    COL_TRANSFORM       [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; SSE2/SSE4 DC-only add: one rounded DC value is added (with saturation via
; packuswb) to all 16 pixels of the 4x4 destination block.
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; load data
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; calculate DC
    movd       m0, r2d
    lea        r2, [r0+r1*2]    ; r2 = dst + 2*stride
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0           ; broadcast DC word to all 8 lanes
    punpckldq  m2, m3           ; rows 0,1 interleaved in m2
    punpckldq  m4, m5           ; rows 2,3 interleaved in m4
    punpcklbw  m2, m1           ; bytes -> words
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4           ; clamp and repack all 16 pixels into m2
    movd       [r0], m2
%if cpuflag(sse4)
    ; SSE4.1: store rows 1-3 directly from dword lanes
    pextrd     [r0+r1], m2, 1
    pextrd     [r2], m2, 2
    pextrd     [r2+r1], m2, 3
%else
    ; SSE2: shift each row's dword down to lane 0 before storing
    psrldq     m2, 4
    movd       [r0+r1], m2
    psrldq     m2, 4
    movd       [r2], m2
    psrldq     m2, 4
    movd       [r2+r1], m2
%endif
    RET
%endmacro

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD