@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute a Radix 4 FFT stage for a N point complex signal
@//
@// This file provides one assembler macro, FFTSTAGE, and expands it twice
@// (forward and inverse) to create the two entry points at the bottom of
@// the file.  Syntax is GNU as for 32-bit ARM with NEON; the file is run
@// through the C preprocessor first (see the #include / #define lines).
@//


@// Include standard headers

#include "dl/api/armCOMM_s.h"
#include "dl/api/omxtypes_s.h"

@//        Import symbols required from other files
@//        (For example tables)




@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Guarding implementation by the processor name


@//      Import symbols required from other files
@//      (For example tables)
    @//IMPORT  armAAC_constTable

@//Input Registers
@// pSrc, pDst, pTwiddle and subFFTSize are read by FFTSTAGE on entry.
@// subFFTNum is only written (set to 1); its incoming value is not read.

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7



@//Output Registers


@//Local Scratch Registers
@// Note: grpCount and pTmp both name r4.  pTmp is only used after the
@// group loop has finished, when grpCount is no longer needed.

#define outPointStep    r3
#define grpCount        r4
#define dstStep         r5
#define grpTwStep       r8
#define stepTwiddle     r9
#define twStep          r10
#define pTmp            r4
#define step16          r11
#define step24          r12


@// Neon Registers
@// D0-D7 have two sets of names: dButterfly* describes the contents as
@// loaded by VLD4, dX* describes the contents after the VUZP shuffles at
@// the top of the loop (dXrN/dXiN = real/imaginary part of input N).

#define dButterfly1Real02       D0.F32
#define dButterfly1Imag02       D1.F32
#define dButterfly1Real13       D2.F32
#define dButterfly1Imag13       D3.F32
#define dButterfly2Real02       D4.F32
#define dButterfly2Imag02       D5.F32
#define dButterfly2Real13       D6.F32
#define dButterfly2Imag13       D7.F32
#define dXr0                    D0.F32
#define dXi0                    D1.F32
#define dXr1                    D2.F32
#define dXi1                    D3.F32
#define dXr2                    D4.F32
#define dXi2                    D5.F32
#define dXr3                    D6.F32
#define dXi3                    D7.F32

@// First-stage butterfly results (halves of qY0..qY3 below).
#define dYr0                    D16.F32
#define dYi0                    D17.F32
#define dYr1                    D18.F32
#define dYi1                    D19.F32
#define dYr2                    D20.F32
#define dYi2                    D21.F32
#define dYr3                    D22.F32
#define dYi3                    D23.F32

@// Twiddle factors for inputs 1, 2 and 3 (real and imaginary halves).
#define dW1r                    D8.F32
#define dW1i                    D9.F32
#define dW2r                    D10.F32
#define dW2i                    D11.F32
#define dW3r                    D12.F32
#define dW3i                    D13.F32
@// qT0..qT5 are not referenced by FFTSTAGE below; despite the q prefix
@// they name D registers.
#define qT0                     d14.f32
#define qT1                     d16.F32
#define qT2                     d18.F32
#define qT3                     d20.f32
#define qT4                     d22.f32
#define qT5                     d24.f32

@// Twiddled inputs and final outputs (halves of qZ0..qZ3 below).
#define dZr0                    D14.F32
#define dZi0                    D15.F32
#define dZr1                    D26.F32
#define dZi1                    D27.F32
#define dZr2                    D28.F32
#define dZi2                    D29.F32
#define dZr3                    D30.F32
#define dZi3                    D31.F32

@// Quad-register views: qX0 = {dXr0,dXi0}, qYn = {dYrn,dYin},
@// qZn = {dZrn,dZin}.
#define qX0                     Q0.F32
#define qY0                     Q8.F32
#define qY1                     Q9.F32
#define qY2                     Q10.F32
#define qY3                     Q11.F32
#define qZ0                     Q7.F32
#define qZ1                     Q13.F32
#define qZ2                     Q14.F32
#define qZ3                     Q15.F32



@//
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the code for one radix-4 stage that reads complex float data from
@// pSrc and writes to pDst (out of place), two butterflies per loop pass.
@//
@// Macro parameters:
@//   scaled  - not referenced anywhere in the macro body.
@//   inverse - when the string equals TRUE the twiddle multiply uses the
@//             conjugate of the twiddle and the 2nd and 4th output stores
@//             are exchanged; any other value selects the forward form.
@//   name    - suffix appended to the local labels so the macro can be
@//             expanded more than once in a file.
@//
@// Registers read on entry: pSrc (r0), pTwiddle (r1), pDst (r2),
@//                          subFFTSize (r7).
@// Registers on exit:
@//   subFFTNum  (r6) = 1
@//   subFFTSize (r7) = 4 * its value on entry
@//   pSrc, pDst      = rewound and exchanged (see the end of the macro)
@//   pTwiddle        = rewound (see the end of the macro)
@// Registers written: r0-r12, D0-D23, D26-D31, and the condition flags.
@//
@// The :128 / :256 / :64 qualifiers on the loads and stores are alignment
@// hints, so pSrc, pDst and pTwiddle are assumed to be suitably aligned.
@// NOTE(review): the loop exit relies on grpCount (4 * subFFTSize on entry)
@// reaching exactly zero in steps of 8; entry values of subFFTSize that are
@// not a positive even number are not guarded against here - presumably
@// the caller guarantees this, confirm.
@//
        .MACRO FFTSTAGE scaled, inverse , name

        @// Define stack arguments


        @// outPointStep = 8 * subFFTSize: the byte distance between the
        @// four output points of one butterfly.  It is used below as the
        @// post-increment of the first three VST2 stores.
        MOV     outPointStep,subFFTSize,LSL #3

        @// Update grpCount and grpSize rightaway

        @// Twiddle preload for the first two groups.  Net effect of the five
        @// loads below: W1 is de-interleaved from the 16 bytes at pTwiddle,
        @// dW2r/dW3r are both read from pTwiddle+0, dW2i from pTwiddle+16,
        @// dW3i from pTwiddle+24, and pTwiddle ends up advanced by 16.
        VLD2    {dW1r,dW1i},[pTwiddle, :128]             @// [wi|wr], no writeback
        MOV     step16,#16
        LSL     grpCount,subFFTSize,#2                   @// grpCount = 4*subFFTSize

        VLD1    dW2r,[pTwiddle, :64]                     @// [wi|wr], no writeback
        MOV     subFFTNum,#1                             @//after the last stage

        VLD1    dW3r,[pTwiddle, :64],step16              @// [wi|wr], pTwiddle += 16
        MOV     stepTwiddle,#0

        VLD1    dW2i,[pTwiddle, :64]!                    @// [wi|wr], pTwiddle += 8
        SUB     grpTwStep,stepTwiddle,#8                 @// grpTwStep = -8 to start with

        @// update subFFTSize for the next stage
        MOV     subFFTSize,grpCount
        VLD1    dW3i,[pTwiddle, :64],grpTwStep           @// [wi|wr], pTwiddle -= 8
        MOV     dstStep,outPointStep,LSL #1              @// dstStep = 2*outPointStep

        @// Preload the inputs of the first two butterflies (32 bytes each).
        @// AC.r AC.i BD.r BD.i
        VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
        ADD     dstStep,dstStep,outPointStep             @// dstStep = 3*outPointStep
        RSB     dstStep,dstStep,#16                      @// dstStep = - 3*outPointStep+16
        MOV     step24,#24

        @// AC.r AC.i BD.r BD.i
        VLD4    {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!


        @// Process two groups at a time

radix4lsGrpLoop\name :

        @// Shuffle the preloaded data so that lane 0 belongs to the first
        @// butterfly and lane 1 to the second: VZIP turns the two [wr,wi]
        @// pairs into a real vector and an imaginary vector, VUZP separates
        @// inputs A/C and B/D.  The integer instructions interleaved here
        @// compute the twiddle pointer steps for this pass:
        @//   stepTwiddle += 16
        @//   twStep       = stepTwiddle - 16
        @//   grpTwStep    = -(2*stepTwiddle + 8)
        VZIP    dW2r,dW2i
        ADD     stepTwiddle,stepTwiddle,#16
        VZIP    dW3r,dW3i
        ADD     grpTwStep,stepTwiddle,#4
        VUZP    dButterfly1Real13, dButterfly2Real13     @// B.r D.r
        SUB     twStep,stepTwiddle,#16                   @// -16+stepTwiddle
        VUZP    dButterfly1Imag13, dButterfly2Imag13     @// B.i D.i
        MOV     grpTwStep,grpTwStep,LSL #1
        VUZP    dButterfly1Real02, dButterfly2Real02     @// A.r C.r
        RSB     grpTwStep,grpTwStep,#0                   @// -8-2*stepTwiddle


        VUZP    dButterfly1Imag02, dButterfly2Imag02     @// A.i C.i


        @// grpCount is multiplied by 4
        @// The flags set here are consumed by addeq/beq further down and by
        @// the BGT that closes the loop; no instruction in between writes
        @// the condition flags.
        SUBS    grpCount,grpCount,#8

        @// Z1 = X1 * W1 (forward) or X1 * conj(W1) (inverse)
        .ifeqs  "\inverse", "TRUE"
        VMUL    dZr1,dW1r,dXr1
        VMLA    dZr1,dW1i,dXi1                           @// real part
        VMUL    dZi1,dW1r,dXi1
        VMLS    dZi1,dW1i,dXr1                           @// imag part

        .else

        VMUL    dZr1,dW1r,dXr1
        VMLS    dZr1,dW1i,dXi1                           @// real part
        VMUL    dZi1,dW1r,dXi1
        VMLA    dZi1,dW1i,dXr1                           @// imag part

        .endif

        @// W1 is no longer needed: fetch W1 for the next pass.
        VLD2    {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]

        @// Z2 = X2 * W2 (forward) or X2 * conj(W2) (inverse).  The load of
        @// the next dW2r is placed after the last read of the current one.
        .ifeqs  "\inverse", "TRUE"
        VMUL    dZr2,dW2r,dXr2
        VMLA    dZr2,dW2i,dXi2                           @// real part
        VMUL    dZi2,dW2r,dXi2
        VLD1    dW2r,[pTwiddle, :64],step16              @// [wi|wr]
        VMLS    dZi2,dW2i,dXr2                           @// imag part

        .else

        VMUL    dZr2,dW2r,dXr2
        VMLS    dZr2,dW2i,dXi2                           @// real part
        VMUL    dZi2,dW2r,dXi2
        VLD1    dW2r,[pTwiddle, :64],step16              @// [wi|wr]
        VMLA    dZi2,dW2i,dXr2                           @// imag part

        .endif


        VLD1    dW2i,[pTwiddle, :64],twStep              @// [wi|wr]

        @// move qX0 so as to load for the next iteration
        @// (input 0 has no twiddle multiply, so Z0 is a plain copy of X0)
        VMOV    qZ0,qX0

        @// Z3 = X3 * W3 (forward) or X3 * conj(W3) (inverse)
        .ifeqs  "\inverse", "TRUE"
        VMUL    dZr3,dW3r,dXr3
        VMLA    dZr3,dW3i,dXi3                           @// real part
        VMUL    dZi3,dW3r,dXi3
        VLD1    dW3r,[pTwiddle, :64],step24
        VMLS    dZi3,dW3i,dXr3                           @// imag part

        .else

        VMUL    dZr3,dW3r,dXr3
        VMLS    dZr3,dW3i,dXi3                           @// real part
        VMUL    dZi3,dW3r,dXi3
        VLD1    dW3r,[pTwiddle, :64],step24
        VMLA    dZi3,dW3i,dXr3                           @// imag part

        .endif

        @// After this load pTwiddle has moved by a net +16 bytes for the
        @// pass (stepTwiddle + 16 + twStep + 24 + grpTwStep = 16).
        VLD1    dW3i,[pTwiddle, :64],grpTwStep           @// [wi|wr]

        @// Don't do the load on the last iteration so we don't read past the end
        @// of pSrc.
        @// pSrc is still advanced by 64 in that case so that the pointer
        @// fix-up after the loop is the same for every path.
        addeq   pSrc, pSrc, #64
        beq     radix4lsSkipRead\name
        @// AC.r AC.i BD.r BD.i
        VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!

        @// AC.r AC.i BD.r BD.i
        VLD4    {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
radix4lsSkipRead\name:

        @// finish first stage of 4 point FFT
        @// Y0 = Z0 + Z2, Y2 = Z0 - Z2, Y1 = Z1 + Z3, Y3 = Z1 - Z3

        VADD    qY0,qZ0,qZ2
        VSUB    qY2,qZ0,qZ2
        VADD    qY1,qZ1,qZ3
        VSUB    qY3,qZ1,qZ3


        @// finish second stage of 4 point FFT
        @// Both branches compute the same four values
        @//   Z0 = Y2 - Y1,  Z2 = Y2 + Y1,
        @//   Z1 = Y0 + j*Y3 (dZr1 = dYr0 - dYi3, dZi1 = dYi0 + dYr3),
        @//   Z3 = Y0 - j*Y3 (dZr3 = dYr0 + dYi3, dZi3 = dYi0 - dYr3),
        @// and differ only in the order they are stored to pDst:
        @//   inverse: Z0, Z3, Z2, Z1      forward: Z0, Z1, Z2, Z3
        @// The four stores are outPointStep bytes apart.
        @// NOTE(review): how these values map onto output bins depends on
        @// the twiddle table and data layout set up outside this file.

        .ifeqs  "\inverse", "TRUE"

        VSUB    qZ0,qY2,qY1

        VADD    dZr3,dYr0,dYi3
        VST2    {dZr0,dZi0},[pDst, :128],outPointStep
        VSUB    dZi3,dYi0,dYr3

        VADD    qZ2,qY2,qY1
        VST2    {dZr3,dZi3},[pDst, :128],outPointStep

        VSUB    dZr1,dYr0,dYi3
        VST2    {dZr2,dZi2},[pDst, :128],outPointStep
        VADD    dZi1,dYi0,dYr3

        @// dstStep = -3*outPointStep + 16: undo the three steps above and
        @// move on by the 16 bytes (two complex floats) just produced.
        VST2    {dZr1,dZi1},[pDst, :128],dstStep


        .else

        VSUB    qZ0,qY2,qY1

        VSUB    dZr1,dYr0,dYi3
        VST2    {dZr0,dZi0},[pDst, :128],outPointStep
        VADD    dZi1,dYi0,dYr3

        VADD    qZ2,qY2,qY1
        VST2    {dZr1,dZi1},[pDst, :128],outPointStep

        VADD    dZr3,dYr0,dYi3
        VST2    {dZr2,dZi2},[pDst, :128],outPointStep
        VSUB    dZi3,dYi0,dYr3

        @// dstStep = -3*outPointStep + 16: undo the three steps above and
        @// move on by the 16 bytes (two complex floats) just produced.
        VST2    {dZr3,dZi3},[pDst, :128],dstStep


        .endif

        @// Loop while grpCount is still positive (flags from the SUBS above).
        BGT     radix4lsGrpLoop\name


        @// Reset and Swap pSrc and pDst for the next stage
        @// pTmp shares r4 with grpCount, which is no longer needed here.
        MOV     pTmp,pDst
        @// Extra increment done in final iteration of the loop
        SUB     pSrc,pSrc,#64
        @// New pDst = advanced pSrc - 4*outPointStep bytes;
        @// new pSrc = advanced pDst - outPointStep bytes.
        @// If the loop ran (entry subFFTSize)/2 times, these are the entry
        @// values of pSrc and pDst respectively, i.e. the buffers swap roles.
        SUB     pDst,pSrc,outPointStep,LSL #2
        SUB     pSrc,pTmp,outPointStep
        @// Rewind pTwiddle: 16 bytes per loop pass (2 * the updated
        @// subFFTSize bytes in total) ...
        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
        @// ... plus the 16 bytes advanced by the preload before the loop.
        SUB     pTwiddle,pTwiddle,#16

        .endm


        @// Forward entry point: FFTSTAGE expanded with inverse = FALSE.
        @// M_START / M_END come from armCOMM_s.h and are not visible here.
        @// NOTE(review): only r4 is passed to M_START although the macro
        @// body also writes r5-r12 and D8-D15; presumably the callers of
        @// these _unsafe routines preserve those registers - confirm.
        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","FALSE",fwd
        M_END


        @// Inverse entry point: FFTSTAGE expanded with inverse = TRUE.
        @// The same NOTE(review) about the M_START register list applies.
        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","TRUE",inv
        M_END


        .end