;******************************************************************************
;* X86 Optimized functions for Open Exr Decoder
;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
;*
;* reorder_pixels, predictor based on patch by John Loy
;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
;*
;* predictor AVX/AVX2 by Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pb_15
cextern pb_80

SECTION .text

34;------------------------------------------------------------------------------
35; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
36;------------------------------------------------------------------------------
37
38%macro REORDER_PIXELS 0
39cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
40    lea                              src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
41    add                               dstq, sizeq         ; dst offset by size
42    shr                              sizeq, 1             ; half_size
43    add                              src1q, sizeq         ; offset src by half_size
44    neg                              sizeq                ; size = offset for dst, src1, src2
45.loop:
46
47    mova                                m0, [src1q+sizeq]        ; load first part
48    movu                                m1, [src2q+sizeq]        ; load second part
49    SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
50    mova                 [dstq+2*sizeq   ], xm0                  ; copy to dst
51    mova                 [dstq+2*sizeq+16], xm1
52%if cpuflag(avx2)
53    vperm2i128                          m0, m0, m1, q0301
54    mova                 [dstq+2*sizeq+32], m0
55%endif
56    add     sizeq, mmsize
57    jl .loop
58    RET
59%endmacro
60
61INIT_XMM sse2
62REORDER_PIXELS
63
64%if HAVE_AVX2_EXTERNAL
65INIT_YMM avx2
66REORDER_PIXELS
67%endif


70;------------------------------------------------------------------------------
71; void ff_predictor(uint8_t *src, ptrdiff_t size);
72;------------------------------------------------------------------------------
73
74%macro PREDICTOR 0
75cglobal predictor, 2,2,5, src, size
76    mova             m0, [pb_80]
77    mova            xm1, [pb_15]
78    mova            xm2, xm0
79    add            srcq, sizeq
80    neg           sizeq
81.loop:
82    pxor             m3, m0, [srcq + sizeq]
83    pslldq           m4, m3, 1
84    paddb            m3, m4
85    pslldq           m4, m3, 2
86    paddb            m3, m4
87    pslldq           m4, m3, 4
88    paddb            m3, m4
89    pslldq           m4, m3, 8
90%if mmsize == 32
91    paddb            m3, m4
92    paddb           xm2, xm3
93    vextracti128    xm4, m3, 1
94    mova [srcq + sizeq], xm2
95    pshufb          xm2, xm1
96    paddb           xm2, xm4
97    mova [srcq + sizeq + 16], xm2
98%else
99    paddb            m2, m3
100    paddb            m2, m4
101    mova [srcq + sizeq], m2
102%endif
103    pshufb          xm2, xm1
104    add           sizeq, mmsize
105    jl .loop
106    RET
107%endmacro
108
109INIT_XMM ssse3
110PREDICTOR
111
112INIT_XMM avx
113PREDICTOR
114
115%if HAVE_AVX2_EXTERNAL
116INIT_YMM avx2
117PREDICTOR
118%endif
119