1;******************************************************************************
2;* optimized bswap buffer functions
3;* Copyright (c) 2008 Loren Merritt
4;* Copyright (c) 2003-2013 Michael Niedermayer
5;* Copyright (c) 2013 Daniel Kang
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control mask: reverses the byte order within each of the four
; dwords of an xmm register (3,2,1,0 | 7,6,5,4 | ...), i.e. a 32-bit
; bswap of all four lanes in a single shuffle.
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

; NOTE(review): pb_80 is not referenced in this chunk — presumably used
; by code outside the visible range; confirm before removing.
cextern pb_80

SECTION_TEXT
32
;------------------------------------------------------------------------------
; BSWAP_LOOPS  %1=a/u  (aligned/unaligned: selects mova/movu for all
;                       16-byte loads and stores)
;
; Byte-swaps the dwords at [r1] into [r0], handling everything except the
; final (count mod 4) dwords, and advances r0/r1 past what was consumed.
;
; In:   r0 = dst, r1 = src, r2 = dword count
;       SSSE3 only: m2 = pb_bswap32 shuffle mask (must be preloaded)
; Out:  r0/r1 advanced; r2 restored to the ORIGINAL dword count so the
;       invoking macro's .left tail code can finish the last 0-3 dwords.
; Clobbers: r3, m0-m1 (SSE2 path also uses m2-m3 as scratch).
; Note: the final jz targets a .left label that must be defined by the
;       macro that expands BSWAP_LOOPS (see BSWAP32_BUF below).
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS  1
    mov      r3, r2            ; save full dword count
    sar      r2, 3             ; r2 = number of 8-dword (32-byte) chunks
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2            ; bswap all 4 dwords with one shuffle
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    ; SSE2 has no pshufb: first swap the two 16-bit words of each dword...
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    ; ...then swap the two bytes of each word via shift-and-or
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3            ; restore full count for the caller's tail
    and      r3, 4             ; one more 4-dword (16-byte) block left?
    jz       .left             ; no -> caller-defined tail label
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    ; same SSE2 word-swap + byte-swap sequence as in the main loop
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro
86
;------------------------------------------------------------------------------
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Byte-swaps w dwords from src to dst.  Dispatches to the aligned SIMD
; path only when (dst | src) is 16-byte aligned, otherwise the unaligned
; path; BSWAP_LOOPS handles all but (w mod 4) dwords and restores r2 = w
; before jumping to .left, where the remainder is finished (SSSE3: one
; 8-byte movq+pshufb for 2 dwords plus one scalar bswap; SSE2: a scalar
; bswap loop of up to 3 dwords).
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]  ; shuffle mask, live for the whole function
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0            ; aligned iff neither pointer has low bits set
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS  u             ; unaligned loads/stores
    jmp      .left
.start_align:
    BSWAP_LOOPS  a             ; aligned loads/stores
.left:                         ; here: r2 = w, 0-3 dwords still to do
%if cpuflag(ssse3)
    mov      r3, r2
    and      r2, 2             ; two dwords left?
    jz       .left1
    movq     m0, [r1]          ; bswap 2 dwords with one 8-byte shuffle
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1             ; one final dword?
    jz       .end
    mov      r2d, [r1]
    bswap    r2d               ; scalar 32-bit byte swap
    mov      [r0], r2d
%else
    and      r2, 3             ; 0-3 dwords left: scalar loop
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2
    jnz      .loop2
%endif
.end:
    RET
%endmacro
135
; Instantiate bswap32_buf once per SIMD level; INIT_XMM sets the cpuflag
; state tested above and the per-ISA symbol suffix used by cglobal.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF
141