1;****************************************************************************** 2;* x86 optimizations for PNG decoding 3;* 4;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> 5;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> 6;* 7;* This file is part of FFmpeg. 8;* 9;* FFmpeg is free software; you can redistribute it and/or 10;* modify it under the terms of the GNU Lesser General Public 11;* License as published by the Free Software Foundation; either 12;* version 2.1 of the License, or (at your option) any later version. 13;* 14;* FFmpeg is distributed in the hope that it will be useful, 15;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17;* Lesser General Public License for more details. 18;* 19;* You should have received a copy of the GNU Lesser General Public 20;* License along with FFmpeg; if not, write to the Free Software 21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22;****************************************************************************** 23 24%include "libavutil/x86/x86util.asm" 25 26SECTION_RODATA 27 28cextern pw_255 29 30SECTION_TEXT 31 32; %1 = nr. 
; of xmm registers used
;
;------------------------------------------------------------------------------
; ADD_BYTES_FN %1
;   %1 = number of xmm registers, forwarded to cglobal.
;
; Emits add_bytes_l2(dst, src1, src2, w) for the instruction set selected by
; the preceding INIT_* line:
;
;     dst[i] = src1[i] + src2[i]      for 0 <= i < w   (per-byte, wrapping)
;
; Register roles:
;   wa = width rounded down to the step of the loop currently running
;   w  = full width (limit of the scalar tail loop)
;   i  = byte index shared by all loops
;
; The width is sign-extended on x86-64 and every loop uses a signed compare,
; so a negative width executes zero iterations.
;
; NOTE(review): the main loop uses mova and memory-operand paddb on all three
; pointers; in the SSE2 build this presumably requires 16-byte aligned
; buffers -- confirm against the callers.
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad
%endif
    xor                 iq, iq

    ; main vector loop: two full registers (2*mmsize bytes) per iteration
    mov                 wq, waq
    and                waq, ~(mmsize*2-1)
    jmp .end_v
.loop_v:
    mova                m0, [src1q+iq]
    mova                m1, [src1q+iq+mmsize]
    paddb               m0, [src2q+iq]
    paddb               m1, [src2q+iq+mmsize]
    mova  [dstq+iq       ], m0
    mova  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte loop for what the 32-byte loop left over.
    ; This runs in the low half of xmm registers rather than in mm0, so the
    ; SSE2 function never touches MMX/x87 state and needs no emms.
    ; m0/m1 are free again here because the main loop has finished.
    mov                waq, wq
    and                waq, ~7
    jmp .end_l
.loop_l:
    movq                m0, [src1q+iq]
    movq                m1, [src2q+iq]
    paddb               m0, m1
    movq  [dstq+iq       ], m0
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover bytes; wa is dead as a limit by now, so its
    ; low byte serves as the scratch register
    jmp .end_s
.loop_s:
    mov                wab, [src1q+iq]
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

INIT_XMM sse2
ADD_BYTES_FN 2

;------------------------------------------------------------------------------
; ADD_PAETH_PRED_FN %1
;   %1 = number of xmm registers, forwarded to cglobal.
;
; Emits add_png_paeth_prediction(dst, src, top, w, bpp):
;
;     dst[i] = (src[i] + pred(dst[i-bpp], top[i], top[i-bpp])) & 255
;
; where, with a = dst[i-bpp] (left), b = top[i] (above), c = top[i-bpp]
; (above-left):
;     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2c|
;     pred = a  if pa <= min(pb, pc)
;            b  else if pb <= pc
;            c  otherwise
;
; Bytes are widened to words (punpcklbw against the zero register m7) so the
; differences can be computed signed.
;
; Register roles inside .loop:
;   m0 = a (the previous result, still as words)   m1 = b   m2 = c
;   m3 = pa, then the "not a" mask, then the selected a
;   m4 = pb, then the "choose c" mask
;   m5 = pc
;   m6 = scratch / "choose b" mask
;   m7 = zero (temporarily reused by the non-SSSE3 abs sequence, then
;        cleared again)
;
; Each inner iteration handles one movh-sized group (mmsize/2 bytes) and then
; advances by bpp.  The outer .bpp_loop repeats the pass for every such group
; within a pixel, counting cntr down from (bpp-1) >> (2 + mmsize/16) to 0.
;
; After the prologue, top and src hold offsets relative to dst, so a single
; moving pointer (dst) addresses all three rows.
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)]  ; last address at which a whole group still fits
    sub               topq, dstq                    ; top -> offset from dst
    sub               srcq, dstq                    ; src -> offset from dst
    sub               dstq, bppq                    ; start one pixel to the left
    pxor                m7, m7

    PUSH              dstq                          ; keep dst-bpp for the later passes
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16
.bpp_loop:
    lea               dstq, [dstq+cntrq*(mmsize/2)] ; select this pass's group within the pixel
    movh                m0, [dstq]                  ; a = left
    movh                m1, [topq+dstq]             ; becomes c on the first iteration
    punpcklbw           m0, m7
    punpcklbw           m1, m7
    add               dstq, bppq
.loop:
    mova                m2, m1                      ; c = previous b
    movh                m1, [topq+dstq]             ; b = above
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1                      ; c - b
    psubw               m4, m0                      ; c - a
    mova                m5, m3
    paddw               m5, m4                      ; 2c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3                      ; pa
    pabsw               m4, m4                      ; pb
    pabsw               m5, m5                      ; pc
%else ; !cpuflag(ssse3)
    ; abs(x) = max(x, 0 - x); m7 is zero on entry and is zeroed again on exit
    psubw               m7, m5
    pmaxsw              m5, m7                      ; pc
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6                      ; pa
    pmaxsw              m4, m7                      ; pb
    pxor                m7, m7
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5                      ; min(pb, pc)
    pcmpgtw             m3, m6                      ; mask: pa > min(pb, pc)  -> not a
    pcmpgtw             m4, m5                      ; mask: pb > pc           -> c beats b
    mova                m6, m4
    pand                m4, m3                      ; mask: choose c
    pandn               m6, m3                      ; mask: choose b
    pandn               m3, m0                      ; a where a is chosen, else 0
    movh                m0, [srcq+dstq]
    pand                m6, m1                      ; b where chosen
    pand                m2, m4                      ; c where chosen
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3                      ; src + predictor
    pand                m0, [pw_255]                ; wrap to a byte; m0 is the next iteration's a
    mova                m3, m0
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    jbe .loop                                       ; unsigned: dst and end are addresses

    mov               dstq, [rsp]                   ; reload the saved dst-bpp for the next pass
    dec              cntrq
    jge .bpp_loop
    POP               dstq
    RET
%endmacro

INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0