;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
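; pshufb control mask: within each 16-byte lane, byte indices 3,2,1,0 (etc.)
; reverse the byte order of each 32-bit element independently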
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION .text

; %1 = aligned/unaligned
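; r2d holds the dword count on entry; each .loop8 iteration swaps 2*mmsize
; bytes (8 dwords with xmm, 16 with ymm), so the count is shifted down here
; and the remainder is handled by the .left* tails below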
%macro BSWAP_LOOPS  1
    mov      r3d, r2d
    sar      r2d, 3
    jz       .left4_%1
%if cpuflag(avx2)
    sar      r2d, 1
    jz       .left8_%1
%endif
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
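    ; fast path: a single pshufb per vector reverses the bytes of every
    ; dword, using the pb_bswap32 mask preloaded into m2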
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + mmsize], m1
%else
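    ; pre-SSSE3 fallback: pshuflw/pshufhw with control 10110001b exchange the
    ; two 16-bit words of each dword, then the shift/or sequence swaps the
    ; two bytes within each word, completing the 32-bit byteswap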
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1
%if cpuflag(avx2)
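; AVX2 only: the main loop consumed 16-dword blocks, so up to one full
; 32-byte (8-dword) vector can remain; swap it with a single ymm pshufb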
.left8_%1:
    mov      r2d, r3d
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 +  0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif
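; up to one 16-byte (4-dword) block can remain regardless of vector width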
.left4_%1:
    mov      r2d, r3d
    test     r3d, 4
    jz       .left
    mov%1    xm0, [r1]
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
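; equivalent scalar reference (illustration only):
;     for (i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);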
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    VBROADCASTI128  m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
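    ; dispatch on alignment: OR-ing the src and dst pointers and testing the
    ; low bits against mmsize-1 detects whether both are vector-aligned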
    or       r3, r0
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:
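; at most 3 dwords remain: SSSE3+ swaps a qword pair with pshufb plus a
; final scalar bswap; the SSE2 path loops with the scalar bswap alone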
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     xm0, [r1]
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif