;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_80

SECTION .text

;--------------------------------------------------------------------------
; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                   ptrdiff_t line_size)
;--------------------------------------------------------------------------

%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova     m1, [blockq+mmsize*0+%1]
    mova     m2, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m3, [blockq+mmsize*4+%1]
    mova     m4, [blockq+mmsize*6+%1]
%endif
    packsswb m1, [blockq+mmsize*1+%1]
    packsswb m2, [blockq+mmsize*3+%1]
%if mmsize == 8
    packsswb m3, [blockq+mmsize*5+%1]
    packsswb m4, [blockq+mmsize*7+%1]
%endif
    paddb    m1, m0
    paddb    m2, m0
%if mmsize == 8
    paddb    m3, m0
    paddb    m4, m0
    movq     [pixelsq+lsizeq*0], m1
    movq     [pixelsq+lsizeq*1], m2
    movq     [pixelsq+lsizeq*2], m3
    movq     [pixelsq+lsize3q ], m4
%else
    movq     [pixelsq+lsizeq*0], m1
    movhps   [pixelsq+lsizeq*1], m1
    movq     [pixelsq+lsizeq*2], m2
    movhps   [pixelsq+lsize3q ], m2
%endif
%endmacro

%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    mova     m0, [pb_80]
    lea      lsize3q, [lsizeq*3]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
    lea      pixelsq, [pixelsq+lsizeq*4]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3

;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
; %1 = block offset
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq           [pixelsq], m0
    movq    [lsizeq+pixelsq], m1
    movq  [2*lsizeq+pixelsq], m2
    movq   [lsize3q+pixelsq], m3
%else
    movq           [pixelsq], m0
    movhps  [lsizeq+pixelsq], m0
    movq  [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro

%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_PIXELS_CLAMPED
INIT_XMM sse2
PUT_PIXELS_CLAMPED

;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
; %1 = block offset
%macro ADD_PIXELS_CLAMPED 1
    mova       m0, [blockq+mmsize*0+%1]
    mova       m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova       m5, [blockq+mmsize*2+%1]
    mova       m6, [blockq+mmsize*3+%1]
%endif
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova       m7, m2
    punpcklbw  m2, m4
    punpckhbw  m7, m4
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4
    punpckhbw  m7, m4
    paddsw     m5, m3
    paddsw     m6, m7
%else
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, m2
    paddsw     m1, m3
%endif
    packuswb   m0, m1
%if mmsize == 8
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro

%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor       m4, m4
    ADD_PIXELS_CLAMPED 0
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
%endmacro

INIT_MMX mmx
ADD_PIXELS_CLAMPED
INIT_XMM sse2
ADD_PIXELS_CLAMPED