1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12
13@//
14@//
15@// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7765
18@// Last Modified Date:       Thu, 27 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute a Radix 4 FFT stage for an N-point complex signal
26@//
27@//
28
29
30@// Include standard headers
31
32#include "dl/api/armCOMM_s.h"
33#include "dl/api/omxtypes_s.h"
34
35
36@// Import symbols required from other files
37@// (For example tables)
38
39
40
41
42@// Set debugging level
43@//DEBUG_ON    SETL {TRUE}
44
45
46@// Guarding implementation by the processor name
47
48
49
50
51
52
53@// Guarding implementation by the processor name
54
55
56@// Import symbols required from other files
57@// (For example tables)
58    @//IMPORT  armAAC_constTable
59
60@//Input Registers
@// Core-register aliases that FFTSTAGE expects to be set up on entry.
@//   pSrc       - source pointer; read with post-incrementing VLD4 loads
@//   pTwiddle   - twiddle table base; copied into pw1/pw2/pw3, never written
@//   pDst       - destination pointer; written with VST2 stores
@//   subFFTSize - shifted left by 2 to form both the output stride and the
@//                loop counter; FFTSTAGE leaves 4x the entry value in it
@//   subFFTNum  - not read by FFTSTAGE; it is overwritten with 1
61
62#define pSrc                            r0
63#define pDst                            r2
64#define pTwiddle                        r1
65#define subFFTNum                       r6
66#define subFFTSize                      r7
67
68
69
70@//Output Registers
71
72
73@//Local Scratch Registers
@// All of these are overwritten by FFTSTAGE.
@//   outPointStep - byte distance between the four output rows of a butterfly
@//   grpCount     - loop counter, decremented by 16 per pass
@//   dstStep      - 16 - 3*outPointStep, applied after the fourth store
@//   pw1/pw2/pw3  - running read pointers into the twiddle table
@//   pTmp         - shares r4 with grpCount; only used after the loop has
@//                  finished, so the two never hold live values together
74
75#define outPointStep                    r3
76#define grpCount                        r4
77#define dstStep                         r5
78#define pw1                             r8
79#define pw2                             r9
80#define pw3                             r10
81#define pTmp                            r4
82
83
84@// Neon Registers
@// Several aliases below deliberately name the same physical register; the
@// code relies on one use being finished before the next one starts.
@// NOTE(review): D8-D15 are written by FFTSTAGE. Nothing in this file shows
@// who preserves them for the caller - confirm in the calling code.
85
@// D0-D7 under two sets of names: dButterfly* for the layout produced by the
@// VLD4 loads (samples 0/2 and 1/3 of each group still mixed), dX* for the
@// same registers once VUZP has separated them into X0..X3.
86#define dButterfly1Real02               D0.S16
87#define dButterfly1Imag02               D1.S16
88#define dButterfly1Real13               D2.S16
89#define dButterfly1Imag13               D3.S16
90#define dButterfly2Real02               D4.S16
91#define dButterfly2Imag02               D5.S16
92#define dButterfly2Real13               D6.S16
93#define dButterfly2Imag13               D7.S16
94#define dXr0                            D0.S16
95#define dXi0                            D1.S16
96#define dXr1                            D2.S16
97#define dXi1                            D3.S16
98#define dXr2                            D4.S16
99#define dXi2                            D5.S16
100#define dXr3                            D6.S16
101#define dXi3                            D7.S16
102
@// Twiddle registers viewed as 32-bit lanes (one lane per complex entry).
@// Only the dW3*S32 names are used by FFTSTAGE, as VLD3 destinations.
103#define dW1rS32                         D8.S32
104#define dW1iS32                         D9.S32
105#define dW2rS32                         D10.S32
106#define dW2iS32                         D11.S32
107#define dW3rS32                         D12.S32
108#define dW3iS32                         D13.S32
109
@// The same twiddle registers viewed as 16-bit lanes for the multiplies.
110#define dW1r                            D8.S16
111#define dW1i                            D9.S16
112#define dW2r                            D10.S16
113#define dW2i                            D11.S16
114#define dW3r                            D12.S16
115#define dW3i                            D13.S16
116
@// Dump registers for the table entries the strided twiddle loads skip.
@// dTmp0/dTmp1 are D12/D13 (= dW3r/dW3i), dTmp1S32 is D13 (= dW3iS32), and
@// dTmp2S32/dTmp3S32 are D14/D15 (= dYr3/dYi3).
117#define dTmp0                           D12.S16
118#define dTmp1                           D13.S16
119#define dTmp1S32                        D13.S32
120#define dTmp2S32                        D14.S32
121#define dTmp3S32                        D15.S32
122
@// First-stage butterfly results. qYn is the {dYrn,dYin} pair.
123#define dYr0                            D18.S16
124#define dYi0                            D19.S16
125#define dYr1                            D16.S16
126#define dYi1                            D17.S16
127#define dYr2                            D20.S16
128#define dYi2                            D21.S16
129#define dYr3                            D14.S16
130#define dYi3                            D15.S16
131#define qY0                             Q9.S16
132#define qY1                             Q8.S16
133#define qY2                             Q10.S16
134#define qY3                             Q7.S16
135
@// Quad views of the input: qXn is the {dXrn,dXin} pair.
136#define qX0                             Q0.S16
137#define qX1                             Q1.S16
138#define qX2                             Q2.S16
139#define qX3                             Q3.S16
140
@// 32-bit product accumulators. They occupy the qY registers:
@// qT0 = Q9 (qY0), qT1 = Q10 (qY2), qT2 = Q7 (qY3), qT3 = Q8 (qY1).
141#define qT0                             Q9.S32
142#define qT1                             Q10.S32
143#define qT2                             Q7.S32
144#define qT3                             Q8.S32
145
@// dZ*1..3 first hold the twiddled inputs, then all dZ* hold the outputs
@// that are stored to pDst. qZn is the {dZrn,dZin} pair.
146#define dZr0                            D22.S16
147#define dZi0                            D23.S16
148#define dZr1                            D24.S16
149#define dZi1                            D25.S16
150#define dZr2                            D26.S16
151#define dZi2                            D27.S16
152#define dZr3                            D28.S16
153#define dZi3                            D29.S16
154
155#define qZ0                             Q11.S16
156#define qZ1                             Q12.S16
157#define qZ2                             Q13.S16
158#define qZ3                             Q14.S16
159
160
@// FFTSTAGE scaled, inverse, name
@//
@// Emits one radix-4 pass that reads complex S16 samples from pSrc and writes
@// them to pDst. Each loop pass works on 16 samples (two 32-byte VLD4 loads),
@// handled as four groups of four consecutive samples X0..X3. X1, X2 and X3
@// are multiplied by twiddles read through pw1, pw2 and pw3, the products are
@// narrowed back to 16 bits, and the four butterfly outputs are stored in
@// four rows that lie outPointStep bytes apart.
@//
@// Parameters (tested as strings with .ifeqs):
@//   scaled  - "TRUE" builds the butterfly from halving VHADD/VHSUB,
@//             anything else from VADD/VSUB.
@//   inverse - "TRUE" multiplies by the conjugated twiddle and stores the two
@//             odd outputs in exchanged rows; anything else multiplies by
@//             the twiddle as stored.
@//   name    - suffix for the loop label, so each expansion gets its own.
@//
@// In:    pSrc, pDst, pTwiddle, subFFTSize
@// Out:   pSrc and pDst rewound and exchanged, subFFTSize = 4 * entry value,
@//        subFFTNum = 1
@// Uses:  r3-r5, r8-r10, D0-D29, condition flags
@// Align: the :256 / :128 / :64 qualifiers require pSrc and pTwiddle to be
@//        32-byte aligned and pDst (and outPointStep) 16-byte aligned.
@// NOTE(review): the loads for the next pass are issued inside the current
@// pass, including the last one, so 64 bytes beyond the consumed input and
@// some twiddle entries beyond the consumed ones are read.
161        .MACRO FFTSTAGE scaled, inverse , name
162
163        @// No stack arguments are used; all operands come from the register aliases
164
           @// Preload W2: VLD4 keeps every second table entry in dW2r/dW2i, the
           @// skipped entries land in dTmp0/dTmp1 and are overwritten below.
165        MOV     pw2,pTwiddle
166        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
167
168        MOV     pw3,pTwiddle
169        MOV     pw1,pTwiddle
170        @// outPointStep is a byte stride. One complex SC16 sample is 4 bytes,
171        @// so subFFTSize samples == 4*subFFTSize bytes between output rows
172        MOV     outPointStep,subFFTSize,LSL #2
173
           @// Preload W3, first half: 32-bit lanes, keeps entries 0 and 3 in dW3rS32.
174        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
175        MOV     subFFTNum,#1                            @//after the last stage
           @// Loop counter starts at 4*subFFTSize and drops by 16 per pass.
176        LSL     grpCount,subFFTSize,#2
177
178
179        @// Preload W3, second half: keeps the next two wanted entries in dW3iS32
180        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
181
182        @// update subFFTSize for the next stage
183        MOV     subFFTSize,grpCount
184        MOV     dstStep,outPointStep,LSL #1
185
           @// Preload W1: consecutive table entries, real parts in dW1r, imaginary in dW1i.
186        VLD2 {dW1r,dW1i}, [pw1, :128]!
187
188
189        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
190        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
191
           @// Preload the first 16 input samples (2 x 32 bytes).
192        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
193        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
194
195        @// Process 4 groups at a time
196
197grpLoop\name:
198
199
200        @// Rearrange the third twiddle
           @// (the two VLD3 loads left whole re/im pairs in 32-bit lanes; VUZP
           @// separates them into real parts in dW3r and imaginary parts in dW3i)
201        VUZP    dW3r,dW3i
           @// Sets the flags tested by the BGT at the bottom of the loop; every
           @// instruction in between is a NEON operation.
202        SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
203
204
           @// Separate the interleaved samples so that dX*0..dX*3 each hold one
           @// sample position for all four groups.
205        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
206        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
207        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
208        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
209
210
           @// qT0/qT1 = X1 * W1 (inverse: X1 * conj(W1)), widened to 32 bits.
211        .ifeqs  "\inverse", "TRUE"
212            VMULL   qT0,dXr1,dW1r
213            VMLAL   qT0,dXi1,dW1i                       @// real part
214            VMULL   qT1,dXi1,dW1r
215            VMLSL   qT1,dXr1,dW1i                       @// imag part
216
217        .ELSE
218            VMULL   qT0,dXr1,dW1r
219            VMLSL   qT0,dXi1,dW1i                       @// real part
220            VMULL   qT1,dXi1,dW1r
221            VMLAL   qT1,dXr1,dW1i                       @// imag part
222
223        .ENDIF
224
225        @// Load the first twiddle for 4 groups : w^1
226        @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
           @// (W1 has just been consumed, so this load is for the next pass)
227
228        VLD2 {dW1r,dW1i}, [pw1, :128]!
229
           @// qT2/qT3 = X2 * W2 (inverse: X2 * conj(W2)).
230        .ifeqs  "\inverse", "TRUE"
231            VMULL   qT2,dXr2,dW2r
232            VMLAL   qT2,dXi2,dW2i                       @// real part
233            VMULL   qT3,dXi2,dW2r
234            VMLSL   qT3,dXr2,dW2i                       @// imag part
235
236        .ELSE
237            VMULL   qT2,dXr2,dW2r
238            VMLSL   qT2,dXi2,dW2i                       @// real part
239            VMULL   qT3,dXi2,dW2r
240            VMLAL   qT3,dXr2,dW2i                       @// imag part
241
242        .ENDIF
243
           @// Z1 = X1*W1 narrowed to 16 bits with a rounding shift by 15.
           @// This frees qT0/qT1 for the X3 product below.
244        VRSHRN  dZr1,qT0,#15
245        VRSHRN  dZi1,qT1,#15
246
247
248
           @// qT0/qT1 = X3 * W3 (inverse: X3 * conj(W3)).
249        .ifeqs  "\inverse", "TRUE"
250            VMULL   qT0,dXr3,dW3r
251            VMLAL   qT0,dXi3,dW3i                       @// real part
252            VMULL   qT1,dXi3,dW3r
253            VMLSL   qT1,dXr3,dW3i                       @// imag part
254
255        .ELSE
256            VMULL   qT0,dXr3,dW3r
257            VMLSL   qT0,dXi3,dW3i                       @// real part
258            VMULL   qT1,dXi3,dW3r
259            VMLAL   qT1,dXr3,dW3i                       @// imag part
260
261        .ENDIF
262
263        @// Load the second twiddle for 4 groups : w^2
264        @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
           @// (for the next pass; dTmp0/dTmp1 overwrite dW3r/dW3i, which are done)
265        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
266
267
           @// Z2 = X2*W2 narrowed. This must come before the VLD3 below, which
           @// writes D14 (half of qT2).
268        VRSHRN  dZr2,qT2,#15
269        VRSHRN  dZi2,qT3,#15
270
271        @// Load the third twiddle for 4 groups : w^3
272        @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
273
274        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
275
           @// Z3 = X3*W3 narrowed.
276        VRSHRN  dZr3,qT0,#15
277        VRSHRN  dZi3,qT1,#15
278
279        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
280
           @// Butterfly. Both branches have the same structure and differ only in
           @// using halving (VH*) or plain adds/subs:
           @//   Y0 = X0 + Z2   Y2 = X0 - Z2   Y1 = Z1 + Z3   Y3 = Z1 - Z3
           @//   Z0 = Y2 - Y1   Z2 = Y2 + Y1
           @//   Z1 = Y0 + j*Y3 Z3 = Y0 - j*Y3
           @// Rows are stored as Z0,Z1,Z2,Z3 (forward) or Z0,Z3,Z2,Z1 (inverse).
           @// The two VLD4 loads interleaved here fetch the next pass's input;
           @// X0 has already been consumed when D0-D3 are reloaded.
           @// NOTE(review): Z0/Z2 are formed from Y2 (= X0 - Z2), not Y0; confirm
           @// this matches the intended radix-4 butterfly against a reference.
281        .ifeqs "\scaled", "TRUE"
282
283            @// finish first stage of 4 point FFT
284
285            VHADD    qY0,qX0,qZ2
286            VHSUB    qY2,qX0,qZ2
287            VHADD    qY1,qZ1,qZ3
288            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
289
290            VHSUB    qY3,qZ1,qZ3
291
292            @// finish second stage of 4 point FFT
293
294            VHSUB    qZ0,qY2,qY1
295            VHADD    qZ2,qY2,qY1
296            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
297
298
299            .ifeqs "\inverse", "TRUE"
300
301                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
302                VST2    {dZr0,dZi0},[pDst, :128],outPointStep
303                VHSUB    dZi3,dYi0,dYr3
304
305                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
306                VHADD    dZi1,dYi0,dYr3
307                VST2    {dZr3,dZi3},[pDst, :128],outPointStep
308                VST2    {dZr2,dZi2},[pDst, :128],outPointStep
309                VST2    {dZr1,dZi1},[pDst, :128],dstStep              @// dstStep = -3*outPointStep + 16
310
311            .ELSE
312
313                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
314                VHADD    dZi1,dYi0,dYr3
315
316                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
317                VST2    {dZr0,dZi0},[pDst, :128],outPointStep
318                VHSUB    dZi3,dYi0,dYr3
319                VST2    {dZr1,dZi1},[pDst, :128],outPointStep
320                VST2    {dZr2,dZi2},[pDst, :128],outPointStep
321                VST2    {dZr3,dZi3},[pDst, :128],dstStep              @// dstStep = -3*outPointStep + 16
322
323            .ENDIF
324
325        .ELSE
326
327            @// finish first stage of 4 point FFT
328
329            VADD    qY0,qX0,qZ2
330            VSUB    qY2,qX0,qZ2
331            VADD    qY1,qZ1,qZ3
332            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
333
334            VSUB    qY3,qZ1,qZ3
335
336            @// finish second stage of 4 point FFT
337
338            VSUB    qZ0,qY2,qY1
339            VADD    qZ2,qY2,qY1
340            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
341
342
343            .ifeqs "\inverse", "TRUE"
344
345                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
346                VST2    {dZr0,dZi0},[pDst, :128],outPointStep
347                VSUB    dZi3,dYi0,dYr3
348
349                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
350                VADD    dZi1,dYi0,dYr3
351                VST2    {dZr3,dZi3},[pDst, :128],outPointStep
352                VST2    {dZr2,dZi2},[pDst, :128],outPointStep
353                VST2    {dZr1,dZi1},[pDst, :128],dstStep              @// dstStep = -3*outPointStep + 16
354
355            .ELSE
356
357                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
358                VADD    dZi1,dYi0,dYr3
359
360                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
361                VST2    {dZr0,dZi0},[pDst, :128],outPointStep
362                VSUB    dZi3,dYi0,dYr3
363                VST2    {dZr1,dZi1},[pDst, :128],outPointStep
364                VST2    {dZr2,dZi2},[pDst, :128],outPointStep
365                VST2    {dZr3,dZi3},[pDst, :128],dstStep              @// dstStep = -3*outPointStep + 16
366
367            .ENDIF
368
369
370
371
372        .ENDIF
373
           @// Repeat while the counter left by the SUBS at the loop top is > 0.
374        BGT     grpLoop\name
375
376
377        @// Reset and Swap pSrc and pDst for the next stage
           @// With 4*subFFTSize a multiple of 16, the loop ran outPointStep/16
           @// times. pDst moved forward 16 bytes per pass (outPointStep in total);
           @// pSrc moved 64 bytes in the preload plus 64 per pass
           @// (64 + 4*outPointStep in total). The three SUBs undo exactly those
           @// amounts while exchanging the two pointers. pTmp reuses r4, which
           @// grpCount no longer needs.
378        MOV     pTmp,pDst
379        SUB     pSrc,pSrc,#64                       @// Extra increment currently done in the loop
380        SUB     pDst,pSrc,outPointStep,LSL #2       @// pDst -= size; pSrc -= 4*size bytes
381        SUB     pSrc,pTmp,outPointStep
382
383        .endm
384
385
@// Forward, unscaled variant: FFTSTAGE with scaled="FALSE" (VADD/VSUB
@// butterfly) and inverse="FALSE" (twiddles used as stored). The loop label
@// expands to grpLoopFWD.
@// M_START/M_END are defined outside this file (see armCOMM_s.h).
@// NOTE(review): only r4 is passed to M_START, while FFTSTAGE also writes
@// r3, r5-r10 and D8-D15 - presumably the caller preserves those; confirm.
386        M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
387        FFTSTAGE "FALSE","FALSE",FWD
388        M_END
389
390
@// Inverse, unscaled variant: FFTSTAGE with scaled="FALSE" (VADD/VSUB
@// butterfly) and inverse="TRUE" (conjugated twiddles, odd output rows
@// exchanged). The loop label expands to grpLoopINV.
@// M_START/M_END are defined outside this file (see armCOMM_s.h).
@// NOTE(review): only r4 is passed to M_START, while FFTSTAGE also writes
@// r3, r5-r10 and D8-D15 - presumably the caller preserves those; confirm.
391        M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
392        FFTSTAGE "FALSE","TRUE",INV
393        M_END
394
395
@// Forward, scaled variant: FFTSTAGE with scaled="TRUE" (halving
@// VHADD/VHSUB butterfly) and inverse="FALSE" (twiddles used as stored).
@// The loop label expands to grpLoopFWDSFS.
@// M_START/M_END are defined outside this file (see armCOMM_s.h).
@// NOTE(review): only r4 is passed to M_START, while FFTSTAGE also writes
@// r3, r5-r10 and D8-D15 - presumably the caller preserves those; confirm.
396        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
397        FFTSTAGE "TRUE","FALSE",FWDSFS
398        M_END
399
400
@// Inverse, scaled variant: FFTSTAGE with scaled="TRUE" (halving
@// VHADD/VHSUB butterfly) and inverse="TRUE" (conjugated twiddles, odd
@// output rows exchanged). The loop label expands to grpLoopINVSFS.
@// M_START/M_END are defined outside this file (see armCOMM_s.h).
@// NOTE(review): only r4 is passed to M_START, while FFTSTAGE also writes
@// r3, r5-r10 and D8-D15 - presumably the caller preserves those; confirm.
401        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
402        FFTSTAGE "TRUE","TRUE",INVSFS
403        M_END
404
405
406
407
408
409
410    .END
411