;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control mask: reverses the byte order inside each of the four dwords
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION_TEXT

;------------------------------------------------------------------------------
; BSWAP_LOOPS %1
;   %1 = a (aligned) / u (unaligned): selects mova/movu for loads and stores
;
; In:    r0  = dst, r1 = src, r2d = w (number of 32-bit words)
;        m2  = pb_bswap32 (ssse3 build only)
; Out:   r0/r1 advanced past every word handled here,
;        r2d = original w (so the caller can inspect its low bits)
; Clobb: r3, m0-m1 (ssse3) or m0-m3 (sse2), flags
;
; Handles w/8 iterations of 8 words, then one optional group of 4 words.
; Jumps to the caller-provided label .left when bit 2 of w is clear,
; otherwise falls out of the end of the macro.
;
; All count arithmetic is done on the 32-bit register views: w is a C int,
; so only its low 32 bits are meaningful in the argument register.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS 1
    mov      r3d, r2d               ; keep w for the tail handling
    sar      r2d, 3                 ; r2d = number of 8-word iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    ; swap the two 16-bit halves of every dword ...
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    ; ... then swap the two bytes of every 16-bit word
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2d
    jnz      .loop8_%1
.left4_%1:
    mov      r2d, r3d               ; restore w for the caller's tail code
    test     r3d, 4                 ; one more group of 4 words?
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

;------------------------------------------------------------------------------
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Byte-swaps w 32-bit words from src into dst.
; In:    r0 = dst, r1 = src, r2d = w
; Clobb: r2, r3, m0-m2 (ssse3) / m0-m3 (sse2), flags
;
; The aligned load/store variant is used only when both dst and src are
; 16-byte aligned; otherwise the unaligned variant is used.
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0                 ; combine low address bits of src and dst
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:                              ; r2d = w; at most 3 words remain
%if cpuflag(ssse3)
    test     r2d, 2                 ; two words left?
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1                 ; one last word?
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                 ; r2d = remaining word count (0..3)
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF