1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11@//  to support float instead of SC32.
12@//
13
14@//
15@// Description:
16@// Compute a Radix 4 FFT stage for an N point complex float (FC32) signal
17@//
18@//
19
20
21@// Include standard headers
22
23#include "dl/api/armCOMM_s.h"
24#include "dl/api/omxtypes_s.h"
25
26@// Import symbols required from other files
27@// (For example tables)
28
29
30
31
32@// Set debugging level
33@//DEBUG_ON    SETL {TRUE}
34
35
36@// Guarding implementation by the processor name
37
38
39@// Import symbols required from other files
40@// (For example tables)
41    @//IMPORT  armAAC_constTable
42
43@// Input Registers. As used by FFTSTAGE below: pSrc is read with post-increment, pDst is
44@// written, pTwiddle is read, subFFTSize is read and then quadrupled, subFFTNum is only set to 1.
45#define pSrc            r0
46#define pDst            r2
47#define pTwiddle        r1
48#define subFFTNum       r6
49#define subFFTSize      r7
50
51
52
53@//Output Registers
54
55
56@// Local Scratch Registers (byte strides, loop counter and one temporary used by FFTSTAGE)
57@// grpCount and pTmp share r4: pTmp is only written after the loop, when grpCount is no longer read.
58#define outPointStep    r3
59#define grpCount        r4
60#define dstStep         r5
61#define grpTwStep       r8
62#define stepTwiddle     r9
63#define twStep          r10
64#define pTmp            r4
65#define step16          r11
66#define step24          r12
67
68
69@// Neon Registers
70@// dButterfly* and dX* are two names for the same registers D0-D7 (as loaded by VLD4 / after VUZP).
71#define dButterfly1Real02       D0.F32
72#define dButterfly1Imag02       D1.F32
73#define dButterfly1Real13       D2.F32
74#define dButterfly1Imag13       D3.F32
75#define dButterfly2Real02       D4.F32
76#define dButterfly2Imag02       D5.F32
77#define dButterfly2Real13       D6.F32
78#define dButterfly2Imag13       D7.F32
79#define dXr0                    D0.F32
80#define dXi0                    D1.F32
81#define dXr1                    D2.F32
82#define dXi1                    D3.F32
83#define dXr2                    D4.F32
84#define dXi2                    D5.F32
85#define dXr3                    D6.F32
86#define dXi3                    D7.F32
87@// dYr0..dYi3 (D16-D23) are the two halves of qY0..qY3 (Q8-Q11) defined further down.
88#define dYr0                    D16.F32
89#define dYi0                    D17.F32
90#define dYr1                    D18.F32
91#define dYi1                    D19.F32
92#define dYr2                    D20.F32
93#define dYi2                    D21.F32
94#define dYr3                    D22.F32
95#define dYi3                    D23.F32
96@// Twiddles live in D8-D13. qT0..qT5 name D registers despite the q prefix and are not referenced in the code visible in this file.
97#define dW1r                    D8.F32
98#define dW1i                    D9.F32
99#define dW2r                    D10.F32
100#define dW2i                    D11.F32
101#define dW3r                    D12.F32
102#define dW3i                    D13.F32
103#define qT0                     d14.f32
104#define qT1                     d16.F32
105#define qT2                     d18.F32
106#define qT3                     d20.f32
107#define qT4                     d22.f32
108#define qT5                     d24.f32
109@// dZr0,dZi0 (D14,D15) form qZ0 (Q7); dZr1..dZi3 (D26-D31) form qZ1..qZ3 (Q13-Q15).
110#define dZr0                    D14.F32
111#define dZi0                    D15.F32
112#define dZr1                    D26.F32
113#define dZi1                    D27.F32
114#define dZr2                    D28.F32
115#define dZi2                    D29.F32
116#define dZr3                    D30.F32
117#define dZi3                    D31.F32
118@// Q register views of the D register pairs above; qX0 (Q0) is dXr0,dXi0.
119#define qX0                     Q0.F32
120#define qY0                     Q8.F32
121#define qY1                     Q9.F32
122#define qY2                     Q10.F32
123#define qY3                     Q11.F32
124#define qZ0                     Q7.F32
125#define qZ1                     Q13.F32
126#define qZ2                     Q14.F32
127#define qZ3                     Q15.F32
128
129
130
131        .MACRO FFTSTAGE scaled, inverse , name
132        @// FFTSTAGE: one radix-4 pass reading pSrc and writing pDst, two butterflies per loop iteration.
133        @//   scaled  - accepted but not referenced anywhere in this macro body
134        @//   inverse - TRUE selects the conjugated twiddle multiply and exchanges the 2nd and 4th stores
135        @//   name    - suffix appended to the two local labels so every expansion has unique labels
136        @// outPointStep is the byte distance between the four outputs of one butterfly:
137        @// subFFTSize complex points of 8 bytes each.
138        MOV     outPointStep,subFFTSize,LSL #3
139
140        @// Prologue: ARM and NEON instructions are interleaved. It preloads the twiddles and the
141        @// input points of the first two butterflies and sets up the loop counter and the strides.
142        VLD2    {dW1r,dW1i},[pTwiddle, :128]             @// 16 bytes de-interleaved (entries 0,1); pTwiddle unchanged
143        MOV     step16,#16
144        LSL     grpCount,subFFTSize,#2
145        @// grpCount = 4*subFFTSize; the loop subtracts 8 per iteration.
146        VLD1    dW2r,[pTwiddle, :64]                     @// entry 0 again (8 bytes); pTwiddle unchanged
147        MOV     subFFTNum,#1                            @//after the last stage
148
149        VLD1    dW3r,[pTwiddle, :64],step16              @// entry 0 again; pTwiddle += 16
150        MOV     stepTwiddle,#0
151
152        VLD1    dW2i,[pTwiddle, :64]!                    @// 8 bytes at entry offset 16; pTwiddle += 8
153        SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with
154
155        @// update subFFTSize for the next stage (4 times the value on entry)
156        MOV     subFFTSize,grpCount
157        VLD1    dW3i,[pTwiddle, :64],grpTwStep           @// 8 bytes at entry offset 24; pTwiddle -= 8 (net +16)
158        MOV     dstStep,outPointStep,LSL #1
159
160        @// Four consecutive complex points A,B,C,D: VLD4 yields AC.r AC.i BD.r BD.i
161        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
162        ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
163        RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
164        MOV     step24,#24
165
166        @// Same layout for the next four points (second butterfly): AC.r AC.i BD.r BD.i
167        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
168
169
170        @// Process two groups at a time (lane 0 = first butterfly, lane 1 = second butterfly)
171
172radix4lsGrpLoop\name :
173        @// VZIP regroups the two 8-byte entries of W2 / W3 into first-word and second-word lanes.
174        VZIP    dW2r,dW2i
175        ADD     stepTwiddle,stepTwiddle,#16
176        VZIP    dW3r,dW3i
177        ADD     grpTwStep,stepTwiddle,#4
178        VUZP     dButterfly1Real13, dButterfly2Real13   @// B.r D.r
179        SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
180        VUZP     dButterfly1Imag13, dButterfly2Imag13   @// B.i D.i
181        MOV     grpTwStep,grpTwStep,LSL #1
182        VUZP     dButterfly1Real02, dButterfly2Real02   @// A.r C.r
183        RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle
184
185        @// After the VUZPs: X0 = A, X1 = B, X2 = C, X3 = D, one butterfly per lane.
186        VUZP     dButterfly1Imag02, dButterfly2Imag02   @// A.i C.i
187
188        @// grpCount advances by 8 per iteration (4 per butterfly). The flags set here are consumed
189        @// by addeq/beq and by the closing BGT; nothing in between updates them.
190        SUBS    grpCount,grpCount,#8
191        @// Z1 = W1 * X1 (forward) or conj(W1) * X1 (inverse)
192        .ifeqs  "\inverse", "TRUE"
193            VMUL   dZr1,dW1r,dXr1
194            VMLA   dZr1,dW1i,dXi1                       @// real part
195            VMUL   dZi1,dW1r,dXi1
196            VMLS   dZi1,dW1i,dXr1                       @// imag part
197
198        .else
199
200            VMUL   dZr1,dW1r,dXr1
201            VMLS   dZr1,dW1i,dXi1                       @// real part
202            VMUL   dZi1,dW1r,dXi1
203            VMLA   dZi1,dW1i,dXr1                       @// imag part
204
205        .endif
206        @// W1 is free now: reload it for the next iteration, pTwiddle += stepTwiddle.
207        VLD2    {dW1r,dW1i},[pTwiddle, :128],stepTwiddle      @// [wi|wr]
208        @// Z2 = W2 * X2 (or conj); dW2r is reloaded as soon as its last multiply has issued.
209        .ifeqs  "\inverse", "TRUE"
210            VMUL   dZr2,dW2r,dXr2
211            VMLA   dZr2,dW2i,dXi2                       @// real part
212            VMUL   dZi2,dW2r,dXi2
213            VLD1   dW2r,[pTwiddle, :64],step16           @// [wi|wr]
214            VMLS   dZi2,dW2i,dXr2                       @// imag part
215
216        .else
217
218            VMUL   dZr2,dW2r,dXr2
219            VMLS   dZr2,dW2i,dXi2                       @// real part
220            VMUL   dZi2,dW2r,dXi2
221            VLD1    dW2r,[pTwiddle, :64],step16          @// [wi|wr]
222            VMLA   dZi2,dW2i,dXr2                       @// imag part
223
224        .endif
225
226        @// Second W2 entry for the next iteration, pTwiddle += twStep.
227        VLD1    dW2i,[pTwiddle, :64],twStep              @// [wi|wr]
228
229        @// Z0 = X0: copy it out of Q0 so D0-D3 can be reloaded for the next iteration
230        VMOV     qZ0,qX0
231        @// Z3 = W3 * X3 (or conj); dW3r is reloaded before the last multiply-accumulate.
232        .ifeqs  "\inverse", "TRUE"
233            VMUL   dZr3,dW3r,dXr3
234            VMLA   dZr3,dW3i,dXi3                       @// real part
235            VMUL   dZi3,dW3r,dXi3
236            VLD1    dW3r,[pTwiddle, :64],step24
237            VMLS   dZi3,dW3i,dXr3                       @// imag part
238
239        .else
240
241            VMUL   dZr3,dW3r,dXr3
242            VMLS   dZr3,dW3i,dXi3                       @// real part
243            VMUL   dZi3,dW3r,dXi3
244            VLD1    dW3r,[pTwiddle, :64],step24
245            VMLA   dZi3,dW3i,dXr3                       @// imag part
246
247        .endif
248        @// Twiddle steps of one iteration add up to +16 bytes: s + 16 + (s-16) + 24 + (-8-2s).
249        VLD1    dW3i,[pTwiddle, :64],grpTwStep           @// [wi|wr]
250
251        @// Skip the load on the last iteration (Z flag from SUBS) so we do not read past the end
252        @// of pSrc. pSrc is still advanced by 64 so the correction after the loop is the same.
253        addeq   pSrc, pSrc, #64
254        beq     radix4lsSkipRead\name
255        @// AC.r AC.i BD.r BD.i
256        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
257
258        @// AC.r AC.i BD.r BD.i
259        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
260radix4lsSkipRead\name:
261
262        @// finish first stage of 4 point FFT
263
264        VADD    qY0,qZ0,qZ2
265        VSUB    qY2,qZ0,qZ2
266        VADD    qY1,qZ1,qZ3
267        VSUB    qY3,qZ1,qZ3
268        @// So far: Y0 = Z0 + Z2, Y2 = Z0 - Z2, Y1 = Z1 + Z3, Y3 = Z1 - Z3.
269        @// Second stage as coded: Z0 = Y2 - Y1, Z2 = Y2 + Y1, Z1 = Y0 + j*Y3, Z3 = Y0 - j*Y3.
270        @// finish second stage of 4 point FFT
271        @// NOTE(review): these signs give a plain 4 point DFT only if Z1..Z3 hold negated twiddle products, e.g. a negated twiddle table - TODO confirm against the table setup.
272        .ifeqs  "\inverse", "TRUE"
273            @// Inverse: rows are stored in the order Z0, Z3, Z2, Z1.
274            VSUB    qZ0,qY2,qY1
275
276            VADD    dZr3,dYr0,dYi3
277            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
278            VSUB    dZi3,dYi0,dYr3
279
280            VADD    qZ2,qY2,qY1
281            VST2    {dZr3,dZi3},[pDst, :128],outPointStep
282
283            VSUB    dZr1,dYr0,dYi3
284            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
285            VADD    dZi1,dYi0,dYr3
286
287            @// dstStep = 16 - 3*outPointStep: back to the first output row, 16 bytes (2 points) further on
288            VST2    {dZr1,dZi1},[pDst, :128],dstStep
289
290
291        .else
292            @// Forward: rows are stored in the order Z0, Z1, Z2, Z3.
293            VSUB    qZ0,qY2,qY1
294
295            VSUB    dZr1,dYr0,dYi3
296            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
297            VADD    dZi1,dYi0,dYr3
298
299            VADD    qZ2,qY2,qY1
300            VST2    {dZr1,dZi1},[pDst, :128],outPointStep
301
302            VADD    dZr3,dYr0,dYi3
303            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
304            VSUB    dZi3,dYi0,dYr3
305
306            @// dstStep = 16 - 3*outPointStep: back to the first output row, 16 bytes (2 points) further on
307            VST2    {dZr3,dZi3},[pDst, :128],dstStep
308
309
310        .endif
311        @// Loop while grpCount is still positive (flags from the SUBS above).
312        BGT     radix4lsGrpLoop\name
313
314
315        @// Reset and Swap pSrc and pDst for the next stage
316        MOV     pTmp,pDst
317        @// Undo the extra 64 byte advance of pSrc (pre-load / addeq in the final iteration)
318        SUB     pSrc,pSrc,#64
319        @// New pDst = pSrc - 4*outPointStep, new pSrc = old pDst - outPointStep: the buffers exchange roles
320        SUB     pDst,pSrc,outPointStep,LSL #2
321        SUB     pSrc,pTmp,outPointStep
322        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
323        @// subFFTSize was quadrupled above; with the extra 16 from the prologue this rewinds pTwiddle
324        SUB     pTwiddle,pTwiddle,#16
325
326        .endm
327        @// Forward entry point: expands FFTSTAGE with inverse = FALSE and label suffix fwd.
328        @// NOTE(review): only r4 is passed to M_START while FFTSTAGE also writes r3, r5-r12 and D8-D15; presumably the caller of this unsafe routine preserves them - TODO confirm.
329        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
330        FFTSTAGE "FALSE","FALSE",fwd
331        M_END
332        @// Inverse entry point: expands FFTSTAGE with inverse = TRUE and label suffix inv.
333        @// NOTE(review): only r4 is passed to M_START while FFTSTAGE also writes r3, r5-r12 and D8-D15; presumably the caller of this unsafe routine preserves them - TODO confirm.
334        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
335        FFTSTAGE "FALSE","TRUE",inv
336        M_END
337
338
339        .end
340