;*****************************************************************************
;* x86-optimized functions for convolution filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
half:   dd 0.5                      ; rounding term added before truncation

SECTION .text

; void filter_3x3_sse4(uint8_t *dst, int width,
;                      float rdiv, float bias, const int *const matrix,
;                      const uint8_t *c[], int peak, int radius,
;                      int dstride, int stride)
;
; For every x in [0, width):
;     sum    = c[0][x] * matrix[0] + ... + c[8][x] * matrix[8]
;     dst[x] = saturate_u8(trunc(sum * rdiv + bias + 0.5))
;
; The trailing arguments peak, radius, dstride and stride are not referenced
; by this implementation.

;-----------------------------------------------------------------------------
; PROCESS_V n  -- vector tap: m4 += zero_extend(c<n>[x .. x+3]) * matrix[n]
;
; In:    matrixq, c<n>q, xq; m6 must be all zeros
; Out:   m4 (four 32-bit partial sums) updated
; Clobb: m2, m3
;-----------------------------------------------------------------------------
%macro PROCESS_V 1
    movss        m2, [matrixq + 4 * %1]    ; m2[0] = matrix[n]
    VBROADCASTSS m2, m2                    ; replicate coefficient to all lanes
    movss        m3, [c%1q + xq]           ; load 4 source bytes
    punpcklbw    m3, m6                    ; bytes -> words (m6 == 0)
    punpcklwd    m3, m6                    ; words -> dwords
    pmulld       m2, m3                    ; per-lane 32-bit product
    paddd        m4, m2                    ; accumulate
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_S n  -- scalar tap: rd += c<n>[x] * matrix[n]
;
; In:    matrixq, c<n>q, xq
; Out:   rd updated
; Clobb: ptrd (used as scratch; the c[] table pointer has already been
;        consumed by the time the scalar loop runs), flags
;-----------------------------------------------------------------------------
%macro PROCESS_S 1
    movzx   ptrd, byte [c%1q + xq]         ; zero-extended source byte
    imul    ptrd, [matrixq + 4 * %1]
    add     rd, ptrd
%endmacro

;-----------------------------------------------------------------------------
; FILTER_3X3 -- emits filter_3x3 for the current INIT_* instruction set.
;
; Register roles after the prologue:
;   dstq      destination row
;   widthq    number of pixels (sign-extended to 64 bits)
;   matrixq   9 x int32 coefficients
;   ptrq      c[] pointer table (scratch once c0..c8 are loaded)
;   c0q..c8q  the nine source row pointers
;   rq        tail length for the vector loop, then the scalar accumulator
;   xq        current pixel index
;   m0 / m1   rdiv / bias broadcast to all lanes
;   m5        0.5 broadcast to all lanes
;   m6        zero, used for the byte/word unpacks
;-----------------------------------------------------------------------------
%macro FILTER_3X3 0
%if UNIX64
; rdiv and bias are passed in xmm0/xmm1, so they take no integer argument slot.
cglobal filter_3x3, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif

%if WIN64
    ; Arguments are slot-positional: rdiv/bias arrive in xmm2/xmm3. Rename them
    ; to m0/m1 and fetch matrix/ptr from their stack slots.
    SWAP m0, m2
    SWAP m1, m3
    mov r2q, matrixmp
    mov r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif
    movsxdifnidn widthq, widthd

    ; The scalar loop below tests its condition at the bottom, so without this
    ; guard a non-positive width would still read c0..c8[0] and store dst[0].
    test widthq, widthq
    jle .end

    VBROADCASTSS m0, m0                    ; rdiv in every lane
    VBROADCASTSS m1, m1                    ; bias in every lane
    pxor  m6, m6                           ; zero for PROCESS_V unpacks
    movss m5, [half]
    VBROADCASTSS m5, m5                    ; 0.5 in every lane
    mov   c0q, [ptrq + 0*gprsize]
    mov   c1q, [ptrq + 1*gprsize]
    mov   c2q, [ptrq + 2*gprsize]
    mov   c3q, [ptrq + 3*gprsize]
    mov   c4q, [ptrq + 4*gprsize]
    mov   c5q, [ptrq + 5*gprsize]
    mov   c6q, [ptrq + 6*gprsize]
    mov   c7q, [ptrq + 7*gprsize]
    mov   c8q, [ptrq + 8*gprsize]

    xor   xq, xq
    cmp   widthq, mmsize/4
    jl .loop2                              ; too narrow for one vector step

    ; Split width into a multiple of mmsize/4 (vector loop) plus a tail kept
    ; in rq (scalar loop).
    mov   rq, widthq
    and   rq, mmsize/4-1
    sub   widthq, rq

.loop1:                                    ; mmsize/4 pixels per iteration
    pxor  m4, m4   ; sum = 0;

    PROCESS_V 0
    PROCESS_V 1
    PROCESS_V 2
    PROCESS_V 3
    PROCESS_V 4
    PROCESS_V 5
    PROCESS_V 6
    PROCESS_V 7
    PROCESS_V 8

    cvtdq2ps  m4, m4
    mulps     m4, m0     ; sum *= rdiv
    addps     m4, m1     ; sum += bias
    addps     m4, m5     ; sum += 0.5
    cvttps2dq m4, m4     ; truncate to integer
    packssdw  m4, m4     ; dwords -> words, signed saturation
    packuswb  m4, m4     ; words -> bytes, unsigned saturation
    movss     [dstq + xq], m4              ; store 4 result bytes

    add xq, mmsize/4
    cmp xq, widthq
    jl .loop1

    add widthq, rq                         ; restore the full width
    cmp xq, widthq
    jge .end                               ; no tail pixels left

.loop2:                                    ; one pixel per iteration
    ; reuse r to hold sum, init with zero
    xor rd, rd

    PROCESS_S 0
    PROCESS_S 1
    PROCESS_S 2
    PROCESS_S 3
    PROCESS_S 4
    PROCESS_S 5
    PROCESS_S 6
    PROCESS_S 7
    PROCESS_S 8

    pxor      m4, m4     ; clear all lanes before the scalar convert
    cvtsi2ss  m4, rd
    mulss     m4, m0     ; sum *= rdiv
    addss     m4, m1     ; sum += bias
    addss     m4, m5     ; sum += 0.5
    ; we don't have simple scalar instructions to convert
    ; from 32bit to 8bit with saturation, so here
    ; just use packed version SSE instructions for simplicity.
    cvttps2dq m4, m4     ; trunc to integer
    packssdw  m4, m4
    packuswb  m4, m4
    movd      rd, m4
    mov       [dstq + xq], rb              ; store the low result byte

    add xq, 1
    cmp xq, widthq
    jl .loop2
.end:
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
FILTER_3X3
%endif