;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%if ARCH_X86_64

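; SHLX dst, count: left-shift dst by a variable count. With BMI2, shlx takes the
; count from any GPR; without it we fall back to shl, which requires the count in
; cl (the non-BMI2 path below arranges for the count register to be rcx).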
%macro SHLX 2
%if cpuflag(bmi2)
   shlx %1, %1, %2q
%else
   shl  %1, %2b
%endif
%endmacro

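; REMATRIX: accumulate the 64-bit products samples[ch] * coeffs[ch] into m0.
; pshufd with q2301 swaps adjacent 32-bit lanes so that pmuldq, which only
; multiplies the even lanes, also covers the odd channels. The SSE4 version
; loads two 16-byte blocks to cover 8 channels; the AVX2 version loads 32 bytes
; at once and folds the upper 128-bit half into xm0.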
%macro REMATRIX 0
    movdqa        m0, [samplesq]
    movdqa        m1, [coeffsq ]
    pshufd        m2, m0, q2301
    pshufd        m3, m1, q2301
    pmuldq        m0, m1
    pmuldq        m3, m2
    paddq         m0, m3
%if notcpuflag(avx2)
    movdqa        m1, [samplesq + 16]
    movdqa        m2, [coeffsq  + 16]
    pshufd        m3, m1, q2301
    pshufd        m4, m2, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%else
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%endif
%endmacro

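; LOOP_END: reduce the two 64-bit partial sums in xm0 to a single accumulator,
; then finish one output sample: accum = ((accum >> 14) & mask) + *bypassed_lsbs,
; store it to samples[dest_ch], advance both pointers by one block (MAX_CHANNELS
; entries) and compare against the end pointer so the caller can loop on jne.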
%macro LOOP_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword of the sum down
    paddq        xm0, xm1                           ; add the two 64-bit partial sums
    movq      accumq, xm0                           ; accum = sum(samples[ch] * coeffs[ch])
    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq
%endmacro

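; LOOP_SHIFT_END: like LOOP_END, but first adds the noise term
; noise_buffer[index] << (matrix_noise_shift + 7) to the accumulator, wrapping
; index with access_unit_size_pow2 - 1 and stepping it by index2. The +7 and -1
; adjustments are applied once in the .shift prologue below.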
%macro LOOP_SHIFT_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword of the sum down
    paddq        xm0, xm1                           ; add the two 64-bit partial sums
    movq      accumq, xm0                           ; accum = sum(samples[ch] * coeffs[ch])
    and       indexd, auspd                         ; index &= access_unit_size_pow2;
    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
    add       accumq, noiseq                        ; accum += noise_buffer[index]
    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq
%endmacro

;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
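;
; Scalar sketch of what the loops below compute, reconstructed from the inline
; comments (not necessarily the exact C reference); MAX_CHANNELS is assumed to
; be 8, matching the 8-entry strides used here:
;
;     int index2 = 2 * index + 1;
;     for (unsigned i = 0; i < blockpos; i++) {
;         int64_t accum = 0;
;         for (unsigned ch = 0; ch <= maxchan; ch++)
;             accum += (int64_t) samples[ch] * coeffs[ch];
;         if (matrix_noise_shift) {
;             index &= access_unit_size_pow2 - 1;
;             accum += (int64_t) noise_buffer[index] << (matrix_noise_shift + 7);
;             index += index2;
;         }
;         samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsbs[i * 8];
;         samples += 8;
;     }
;
; Note: the SIMD loops always multiply a full block of 4 or 8 channel/coefficient
; pairs rather than stopping at maxchan.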
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch *= sizeof(int32_t)
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = bypassed_lsbs + blockpos * MAX_CHANNELS
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jl .loop4                                       ; jump if true

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4
    RET

.shift:
%if WIN64
    mov       indexd, indexm         ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m            ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7              ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1              ; access_unit_size_pow2 -= 1
    cmp          r7d, 4              ; is maxchan < 4?
    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
    jl .loop4_shift                  ; jump if maxchan < 4

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift
    RET
%endmacro

INIT_XMM sse4
MLP_REMATRIX_CHANNEL
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif

%endif ; ARCH_X86_64