/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 24 Oct 2018
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_ARM_NEON_D32_FFT_SCRAMBLE_H_
#define DSP_ARCH_ARM_NEON_D32_FFT_SCRAMBLE_H_

#ifndef DSP_ARCH_ARM_NEON_32_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_ARM_NEON_32_IMPL */

namespace neon_d32
{
    // Reorder the complex spectrum into bit-reversed order and apply the first
    // butterfly levels of the direct FFT on blocks of eight samples. The in-place
    // path is taken when the source and destination buffers alias each other.
    void scramble_direct(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank)
    {
        if ((dst_re == src_re) || (dst_im == src_im))
        {
            IF_ARCH_ARM(
                size_t count = 1 << rank;
                size_t i, j, rrank = rank;
                float *d_re, *d_im;
            );

            // Self algorithm
            ARCH_ARM_ASM(
                // Do bit-reverse shuffle
                __ASM_EMIT("rsb         %[rrank], %[rrank], $32")           // rrank = 32 - rank
                __ASM_EMIT("mov         %[i], $1")                          // i = 1

                __ASM_EMIT("1:")
            #if defined(ARCH_ARM6)
                __ASM_EMIT("push        {%[dst_re], %[dst_im]}")
                ARMV6_MV_RBIT32("%[j]", "%[i]", "%[dst_re]", "%[dst_im]", "%[masks]")   // j = reverse_bits(i)
                __ASM_EMIT("pop         {%[dst_re], %[dst_im]}")
            #else
                __ASM_EMIT("rbit        %[j], %[i]")                        // j = reverse_bits(i)
            #endif
                __ASM_EMIT("add         %[src_re], $4")                     // src_re++
                __ASM_EMIT("lsr         %[j], %[rrank]")                    // j = reverse_bits(i) >> (32 - rank)
                __ASM_EMIT("add         %[src_im], $4")                     // src_im++
                __ASM_EMIT("cmp         %[i], %[j]")                        // i <=> j
                __ASM_EMIT("bhs         2f")                                // if (i >= j) continue
                __ASM_EMIT("add         %[d_re], %[dst_re], %[j], LSL $2")  // d_re = &dst_re[j]
                __ASM_EMIT("vldm        %[src_re], {s0}")                   // s0 = *src_re
                __ASM_EMIT("add         %[d_im], %[dst_im], %[j], LSL $2")  // d_im = &dst_im[j]
                __ASM_EMIT("vldm        %[src_im], {s1}")                   // s1 = *src_im
                __ASM_EMIT("vldm        %[d_re], {s2}")                     // s2 = *d_re
                __ASM_EMIT("vldm        %[d_im], {s3}")                     // s3 = *d_im
                __ASM_EMIT("vstm        %[src_re], {s2}")                   // *src_re = s2
                __ASM_EMIT("vstm        %[src_im], {s3}")                   // *src_im = s3
                __ASM_EMIT("vstm        %[d_re], {s0}")                     // *d_re = s0
                __ASM_EMIT("vstm        %[d_im], {s1}")                     // *d_im = s1
                __ASM_EMIT("2:")
                __ASM_EMIT("add         %[i], $1")                          // i++
                __ASM_EMIT("cmp         %[i], %[count]")                    // i <=> count
                __ASM_EMIT("blo         1b")

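                // The butterflies below fuse the first two FFT levels for every block
                // of eight samples. At these levels the only nontrivial twiddle factor
                // is -j, which costs no multiplication: it is realized by the vswp that
                // moves imaginary lanes into the real operand (and vice versa). One
                // twiddle-free radix-2 butterfly, as a scalar readability sketch:
                //   r0' = r0 + r1;   i0' = i0 + i1;      // "even" output
                //   r1' = r0 - r1;   i1' = i0 - i1;      // "odd" output
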
                // Perform x8 butterflies
                __ASM_EMIT("3:")
                __ASM_EMIT("vld2.32     {q0-q1}, [%[dst_re]]")      // q0 = r0 r2 r4 r6, q1 = r1 r3 r5 r7
                __ASM_EMIT("vld2.32     {q2-q3}, [%[dst_im]]")      // q2 = i0 i2 i4 i6, q3 = i1 i3 i5 i7
                __ASM_EMIT("vadd.f32    q4, q0, q1")                // q4 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6'
                __ASM_EMIT("vadd.f32    q5, q2, q3")                // q5 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6'
                __ASM_EMIT("vsub.f32    q0, q0, q1")                // q0 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7'
                __ASM_EMIT("vsub.f32    q1, q2, q3")                // q1 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7'

                // q4 = r0' r2' r4' r6'
                // q0 = r1' r3' r5' r7'
                // q5 = i0' i2' i4' i6'
                // q1 = i1' i3' i5' i7'
                __ASM_EMIT("vuzp.32     q4, q0")                    // q4 = r0' r4' r1' r5', q0 = r2' r6' r3' r7'
                __ASM_EMIT("vuzp.32     q5, q1")                    // q5 = i0' i4' i1' i5', q1 = i2' i6' i3' i7'
                __ASM_EMIT("vswp        d1, d3")                    // q0 = r2' r6' i3' i7', q1 = i2' i6' r3' r7'
                __ASM_EMIT("vadd.f32    q2, q4, q0")                // q2 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5"
                __ASM_EMIT("vsub.f32    q3, q4, q0")                // q3 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7"
                __ASM_EMIT("vadd.f32    q0, q5, q1")                // q0 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7"
                __ASM_EMIT("vsub.f32    q1, q5, q1")                // q1 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5"

                // q0 = i0" i4" i3" i7"
                // q1 = i2" i6" i1" i5"
                // q2 = r0" r4" r1" r5"
                // q3 = r2" r6" r3" r7"
                __ASM_EMIT("vswp        d1, d3")                    // q0 = i0" i4" i1" i5", q1 = i2" i6" i3" i7"
                __ASM_EMIT("vuzp.32     q2, q3")                    // q2 = r0" r1" r2" r3", q3 = r4" r5" r6" r7"
                __ASM_EMIT("vuzp.32     q0, q1")                    // q0 = i0" i1" i2" i3", q1 = i4" i5" i6" i7"

                __ASM_EMIT("vst1.32     {q2-q3}, [%[dst_re]]!")
                __ASM_EMIT("subs        %[count], $8")              // count -= 8
                __ASM_EMIT("vst1.32     {q0-q1}, [%[dst_im]]!")
                __ASM_EMIT("bne         3b")

                : [src_re] "+r" (src_re), [src_im] "+r" (src_im),
                  [dst_re] "+r" (dst_re), [dst_im] "+r" (dst_im),
                  [d_re] "=&r" (d_re), [d_im] "=&r" (d_im),
                  [rrank] "+r" (rrank), [i] "=&r" (i), [j] "=&r" (j),
                  [count] "+r" (count)
                : IF_ARCH_ARM6([masks] "r" (__rb_masks))
                : "cc", "memory",
                  "q0", "q1", "q2", "q3", "q4", "q5"
            );
        }
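        // Out-of-place variant: instead of swapping elements, every block of eight
        // samples is gathered straight from the source at bit-reversed offsets, and
        // the same butterflies are applied before the block is stored contiguously.
        // The addressing, as a rough scalar sketch of the loads below (reverse_bits3
        // denotes a 3-bit reversal; the name is for illustration only):
        //   for (i = 0; i < regs; ++i) {
        //       j = reverse_bits(i) >> (32 - (rank - 3));
        //       // lane k of the block is loaded from src[j + reverse_bits3(k)*regs]
        //   }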
        else
        {
            IF_ARCH_ARM(
                size_t i, j, rrank = rank - 3;
                size_t regs = 1 << rrank;
                float *s_re, *s_im;
            );

            ARCH_ARM_ASM(
                __ASM_EMIT("eor         %[i], %[i]")                        // i = 0
                __ASM_EMIT("rsb         %[rrank], %[rrank], $32")           // rrank = 32 - (rank - 3)

                __ASM_EMIT("1:")
            #if defined(ARCH_ARM6)
                __ASM_EMIT("push        {%[dst_re], %[dst_im]}")
                ARMV6_MV_RBIT32("%[j]", "%[i]", "%[dst_re]", "%[dst_im]", "%[masks]")   // j = reverse_bits(i)
                __ASM_EMIT("pop         {%[dst_re], %[dst_im]}")
            #else
                __ASM_EMIT("rbit        %[j], %[i]")                        // j = reverse_bits(i)
            #endif
                __ASM_EMIT("lsr         %[j], %[rrank]")                    // j = reverse_bits(i) >> (32 - (rank - 3))

                __ASM_EMIT("add         %[s_re], %[src_re], %[j], LSL $2")  // s_re = &src_re[j]
                __ASM_EMIT("add         %[s_im], %[src_im], %[j], LSL $2")  // s_im = &src_im[j]
                __ASM_EMIT("vldm        %[s_re], {s0}")                     // q0 = r0 ? ? ?
                __ASM_EMIT("vldm        %[s_im], {s8}")                     // q2 = i0 ? ? ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs]
                __ASM_EMIT("vldm        %[s_re], {s2}")                     // q0 = r0 ? r4 ?
                __ASM_EMIT("vldm        %[s_im], {s10}")                    // q2 = i0 ? i4 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*2]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*2]
                __ASM_EMIT("vldm        %[s_re], {s1}")                     // q0 = r0 r2 r4 ?
                __ASM_EMIT("vldm        %[s_im], {s9}")                     // q2 = i0 i2 i4 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*3]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*3]
                __ASM_EMIT("vldm        %[s_re], {s3}")                     // q0 = r0 r2 r4 r6
                __ASM_EMIT("vldm        %[s_im], {s11}")                    // q2 = i0 i2 i4 i6

                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*4]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*4]
                __ASM_EMIT("vldm        %[s_re], {s4}")                     // q1 = r1 ? ? ?
                __ASM_EMIT("vldm        %[s_im], {s12}")                    // q3 = i1 ? ? ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*5]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*5]
                __ASM_EMIT("vldm        %[s_re], {s6}")                     // q1 = r1 ? r5 ?
                __ASM_EMIT("vldm        %[s_im], {s14}")                    // q3 = i1 ? i5 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*6]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*6]
                __ASM_EMIT("vldm        %[s_re], {s5}")                     // q1 = r1 r3 r5 ?
                __ASM_EMIT("vldm        %[s_im], {s13}")                    // q3 = i1 i3 i5 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*7]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*7]
                __ASM_EMIT("vldm        %[s_re], {s7}")                     // q1 = r1 r3 r5 r7
                __ASM_EMIT("vldm        %[s_im], {s15}")                    // q3 = i1 i3 i5 i7

                // q0 = r0 r2 r4 r6
                // q1 = r1 r3 r5 r7
                // q2 = i0 i2 i4 i6
                // q3 = i1 i3 i5 i7
                __ASM_EMIT("add         %[i], $1")                          // i++

                // Perform x8 butterflies
                __ASM_EMIT("vadd.f32    q4, q0, q1")                // q4 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6'
                __ASM_EMIT("vadd.f32    q5, q2, q3")                // q5 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6'
                __ASM_EMIT("vsub.f32    q0, q0, q1")                // q0 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7'
                __ASM_EMIT("vsub.f32    q1, q2, q3")                // q1 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7'

                // q4 = r0' r2' r4' r6'
                // q0 = r1' r3' r5' r7'
                // q5 = i0' i2' i4' i6'
                // q1 = i1' i3' i5' i7'
                __ASM_EMIT("vuzp.32     q4, q0")                    // q4 = r0' r4' r1' r5', q0 = r2' r6' r3' r7'
                __ASM_EMIT("vuzp.32     q5, q1")                    // q5 = i0' i4' i1' i5', q1 = i2' i6' i3' i7'
                __ASM_EMIT("vswp        d1, d3")                    // q0 = r2' r6' i3' i7', q1 = i2' i6' r3' r7'
                __ASM_EMIT("vadd.f32    q2, q4, q0")                // q2 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r1" r5"
                __ASM_EMIT("vsub.f32    q3, q4, q0")                // q3 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r3" r7"
                __ASM_EMIT("vadd.f32    q0, q5, q1")                // q0 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i3" i7"
                __ASM_EMIT("vsub.f32    q1, q5, q1")                // q1 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i1" i5"

                // q0 = i0" i4" i3" i7"
                // q1 = i2" i6" i1" i5"
                // q2 = r0" r4" r1" r5"
                // q3 = r2" r6" r3" r7"
                __ASM_EMIT("vswp        d1, d3")                    // q0 = i0" i4" i1" i5", q1 = i2" i6" i3" i7"
                __ASM_EMIT("vuzp.32     q2, q3")                    // q2 = r0" r1" r2" r3", q3 = r4" r5" r6" r7"
                __ASM_EMIT("vuzp.32     q0, q1")                    // q0 = i0" i1" i2" i3", q1 = i4" i5" i6" i7"

                __ASM_EMIT("vst1.32     {q2-q3}, [%[dst_re]]!")
                __ASM_EMIT("cmp         %[i], %[regs]")             // i <=> regs
                __ASM_EMIT("vst1.32     {q0-q1}, [%[dst_im]]!")
                __ASM_EMIT("blo         1b")

                : [dst_re] "+r" (dst_re), [dst_im] "+r" (dst_im),
                  [s_re] "=&r" (s_re), [s_im] "=&r" (s_im),
                  [rrank] "+r" (rrank), [i] "=&r" (i), [j] "=&r" (j)
                : [src_re] "r" (src_re), [src_im] "r" (src_im),
                  [regs] "r" (regs)
                  IF_ARCH_ARM6(, [masks] "r" (__rb_masks))
                : "cc", "memory",
                  "q0", "q1", "q2", "q3", "q4", "q5"
            );
        }
    }

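    // Counterpart of scramble_direct() for the reverse (inverse) FFT: the same
    // bit-reverse shuffle, but the second butterfly round yields its outputs in
    // a different lane order, so the fix-up swap happens on the real parts
    // ("vswp d5, d7") rather than on the imaginary parts ("vswp d1, d3").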
    void scramble_reverse(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t rank)
    {
        if ((dst_re == src_re) || (dst_im == src_im))
        {
            IF_ARCH_ARM(
                size_t count = 1 << rank;
                size_t i, j, rrank = rank;
                float *d_re, *d_im;
            );

            // Self algorithm
            ARCH_ARM_ASM(
                // Do bit-reverse shuffle
                __ASM_EMIT("rsb         %[rrank], %[rrank], $32")           // rrank = 32 - rank
                __ASM_EMIT("mov         %[i], $1")                          // i = 1

                __ASM_EMIT("1:")
            #if defined(ARCH_ARM6)
                __ASM_EMIT("push        {%[dst_re], %[dst_im]}")
                ARMV6_MV_RBIT32("%[j]", "%[i]", "%[dst_re]", "%[dst_im]", "%[masks]")   // j = reverse_bits(i)
                __ASM_EMIT("pop         {%[dst_re], %[dst_im]}")
            #else
                __ASM_EMIT("rbit        %[j], %[i]")                        // j = reverse_bits(i)
            #endif
                __ASM_EMIT("add         %[src_re], $4")                     // src_re++
                __ASM_EMIT("lsr         %[j], %[rrank]")                    // j = reverse_bits(i) >> (32 - rank)
                __ASM_EMIT("add         %[src_im], $4")                     // src_im++
                __ASM_EMIT("cmp         %[i], %[j]")                        // i <=> j
                __ASM_EMIT("bhs         2f")                                // if (i >= j) continue
                __ASM_EMIT("add         %[d_re], %[dst_re], %[j], LSL $2")  // d_re = &dst_re[j]
                __ASM_EMIT("vldm        %[src_re], {s0}")                   // s0 = *src_re
                __ASM_EMIT("add         %[d_im], %[dst_im], %[j], LSL $2")  // d_im = &dst_im[j]
                __ASM_EMIT("vldm        %[src_im], {s1}")                   // s1 = *src_im
                __ASM_EMIT("vldm        %[d_re], {s2}")                     // s2 = *d_re
                __ASM_EMIT("vldm        %[d_im], {s3}")                     // s3 = *d_im
                __ASM_EMIT("vstm        %[src_re], {s2}")                   // *src_re = s2
                __ASM_EMIT("vstm        %[src_im], {s3}")                   // *src_im = s3
                __ASM_EMIT("vstm        %[d_re], {s0}")                     // *d_re = s0
                __ASM_EMIT("vstm        %[d_im], {s1}")                     // *d_im = s1
                __ASM_EMIT("2:")
                __ASM_EMIT("add         %[i], $1")                          // i++
                __ASM_EMIT("cmp         %[i], %[count]")                    // i <=> count
                __ASM_EMIT("blo         1b")

                __ASM_EMIT("eor         %[i], %[i]")                        // i = 0

                // Perform x8 butterflies
                __ASM_EMIT("3:")
                __ASM_EMIT("vld2.32     {q0-q1}, [%[dst_re]]")      // q0 = r0 r2 r4 r6, q1 = r1 r3 r5 r7
                __ASM_EMIT("vld2.32     {q2-q3}, [%[dst_im]]")      // q2 = i0 i2 i4 i6, q3 = i1 i3 i5 i7
                __ASM_EMIT("vadd.f32    q4, q0, q1")                // q4 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6'
                __ASM_EMIT("vadd.f32    q5, q2, q3")                // q5 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6'
                __ASM_EMIT("vsub.f32    q0, q0, q1")                // q0 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7'
                __ASM_EMIT("vsub.f32    q1, q2, q3")                // q1 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7'

                // q4 = r0' r2' r4' r6'
                // q0 = r1' r3' r5' r7'
                // q5 = i0' i2' i4' i6'
                // q1 = i1' i3' i5' i7'
                __ASM_EMIT("vuzp.32     q4, q0")                    // q4 = r0' r4' r1' r5', q0 = r2' r6' r3' r7'
                __ASM_EMIT("vuzp.32     q5, q1")                    // q5 = i0' i4' i1' i5', q1 = i2' i6' i3' i7'
                __ASM_EMIT("vswp        d1, d3")                    // q0 = r2' r6' i3' i7', q1 = i2' i6' r3' r7'
                __ASM_EMIT("vadd.f32    q2, q4, q0")                // q2 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7"
                __ASM_EMIT("vsub.f32    q3, q4, q0")                // q3 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5"
                __ASM_EMIT("vadd.f32    q0, q5, q1")                // q0 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5"
                __ASM_EMIT("vsub.f32    q1, q5, q1")                // q1 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7"

                // q0 = i0" i4" i1" i5"
                // q1 = i2" i6" i3" i7"
                // q2 = r0" r4" r3" r7"
                // q3 = r2" r6" r1" r5"
                __ASM_EMIT("vswp        d5, d7")                    // q2 = r0" r4" r1" r5", q3 = r2" r6" r3" r7"
                __ASM_EMIT("vuzp.32     q2, q3")                    // q2 = r0" r1" r2" r3", q3 = r4" r5" r6" r7"
                __ASM_EMIT("vuzp.32     q0, q1")                    // q0 = i0" i1" i2" i3", q1 = i4" i5" i6" i7"

                __ASM_EMIT("vst1.32     {q2-q3}, [%[dst_re]]!")
                __ASM_EMIT("subs        %[count], $8")              // count -= 8
                __ASM_EMIT("vst1.32     {q0-q1}, [%[dst_im]]!")
                __ASM_EMIT("bne         3b")

                : [src_re] "+r" (src_re), [src_im] "+r" (src_im),
                  [dst_re] "+r" (dst_re), [dst_im] "+r" (dst_im),
                  [d_re] "=&r" (d_re), [d_im] "=&r" (d_im),
                  [rrank] "+r" (rrank), [i] "=&r" (i), [j] "=&r" (j),
                  [count] "+r" (count)
                : IF_ARCH_ARM6([masks] "r" (__rb_masks))
                : "cc", "memory",
                  "q0", "q1", "q2", "q3", "q4", "q5"
            );
        }
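        // Out-of-place variant: mirrors the else-branch of scramble_direct(), i.e.
        // gather eight samples per iteration at bit-reversed offsets, then apply
        // the reverse-FFT butterflies and store the block contiguously.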
        else
        {
            IF_ARCH_ARM(
                size_t i, j, rrank = rank - 3;
                size_t regs = 1 << rrank;
                float *s_re, *s_im;
            );

            ARCH_ARM_ASM(
                __ASM_EMIT("eor         %[i], %[i]")                        // i = 0
                __ASM_EMIT("rsb         %[rrank], %[rrank], $32")           // rrank = 32 - (rank - 3)

                __ASM_EMIT("1:")
            #if defined(ARCH_ARM6)
                __ASM_EMIT("push        {%[dst_re], %[dst_im]}")
                ARMV6_MV_RBIT32("%[j]", "%[i]", "%[dst_re]", "%[dst_im]", "%[masks]")   // j = reverse_bits(i)
                __ASM_EMIT("pop         {%[dst_re], %[dst_im]}")
            #else
                __ASM_EMIT("rbit        %[j], %[i]")                        // j = reverse_bits(i)
            #endif
                __ASM_EMIT("lsr         %[j], %[rrank]")                    // j = reverse_bits(i) >> (32 - (rank - 3))

                __ASM_EMIT("add         %[s_re], %[src_re], %[j], LSL $2")  // s_re = &src_re[j]
                __ASM_EMIT("add         %[s_im], %[src_im], %[j], LSL $2")  // s_im = &src_im[j]
                __ASM_EMIT("vldm        %[s_re], {s0}")                     // q0 = r0 ? ? ?
                __ASM_EMIT("vldm        %[s_im], {s8}")                     // q2 = i0 ? ? ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs]
                __ASM_EMIT("vldm        %[s_re], {s2}")                     // q0 = r0 ? r4 ?
                __ASM_EMIT("vldm        %[s_im], {s10}")                    // q2 = i0 ? i4 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*2]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*2]
                __ASM_EMIT("vldm        %[s_re], {s1}")                     // q0 = r0 r2 r4 ?
                __ASM_EMIT("vldm        %[s_im], {s9}")                     // q2 = i0 i2 i4 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*3]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*3]
                __ASM_EMIT("vldm        %[s_re], {s3}")                     // q0 = r0 r2 r4 r6
                __ASM_EMIT("vldm        %[s_im], {s11}")                    // q2 = i0 i2 i4 i6

                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*4]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*4]
                __ASM_EMIT("vldm        %[s_re], {s4}")                     // q1 = r1 ? ? ?
                __ASM_EMIT("vldm        %[s_im], {s12}")                    // q3 = i1 ? ? ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*5]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*5]
                __ASM_EMIT("vldm        %[s_re], {s6}")                     // q1 = r1 ? r5 ?
                __ASM_EMIT("vldm        %[s_im], {s14}")                    // q3 = i1 ? i5 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*6]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*6]
                __ASM_EMIT("vldm        %[s_re], {s5}")                     // q1 = r1 r3 r5 ?
                __ASM_EMIT("vldm        %[s_im], {s13}")                    // q3 = i1 i3 i5 ?
                __ASM_EMIT("add         %[s_re], %[s_re], %[regs], LSL $2") // s_re = &src_re[j + regs*7]
                __ASM_EMIT("add         %[s_im], %[s_im], %[regs], LSL $2") // s_im = &src_im[j + regs*7]
                __ASM_EMIT("vldm        %[s_re], {s7}")                     // q1 = r1 r3 r5 r7
                __ASM_EMIT("vldm        %[s_im], {s15}")                    // q3 = i1 i3 i5 i7

                // q0 = r0 r2 r4 r6
                // q1 = r1 r3 r5 r7
                // q2 = i0 i2 i4 i6
                // q3 = i1 i3 i5 i7
                __ASM_EMIT("add         %[i], $1")                          // i++

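                // The butterflies below are identical to the in-place branch above:
                // two add/sub rounds whose results come out in the reverse-FFT lane
                // order and are re-sequenced by "vswp d5, d7" plus vuzp.32.
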
                // Perform x8 butterflies
                __ASM_EMIT("vadd.f32    q4, q0, q1")                // q4 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6'
                __ASM_EMIT("vadd.f32    q5, q2, q3")                // q5 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6'
                __ASM_EMIT("vsub.f32    q0, q0, q1")                // q0 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7'
                __ASM_EMIT("vsub.f32    q1, q2, q3")                // q1 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7'

                // q4 = r0' r2' r4' r6'
                // q0 = r1' r3' r5' r7'
                // q5 = i0' i2' i4' i6'
                // q1 = i1' i3' i5' i7'
                __ASM_EMIT("vuzp.32     q4, q0")                    // q4 = r0' r4' r1' r5', q0 = r2' r6' r3' r7'
                __ASM_EMIT("vuzp.32     q5, q1")                    // q5 = i0' i4' i1' i5', q1 = i2' i6' i3' i7'
                __ASM_EMIT("vswp        d1, d3")                    // q0 = r2' r6' i3' i7', q1 = i2' i6' r3' r7'
                __ASM_EMIT("vadd.f32    q2, q4, q0")                // q2 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7"
                __ASM_EMIT("vsub.f32    q3, q4, q0")                // q3 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5"
                __ASM_EMIT("vadd.f32    q0, q5, q1")                // q0 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5"
                __ASM_EMIT("vsub.f32    q1, q5, q1")                // q1 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7"

                // q0 = i0" i4" i1" i5"
                // q1 = i2" i6" i3" i7"
                // q2 = r0" r4" r3" r7"
                // q3 = r2" r6" r1" r5"
                __ASM_EMIT("vswp        d5, d7")                    // q2 = r0" r4" r1" r5", q3 = r2" r6" r3" r7"
                __ASM_EMIT("vuzp.32     q2, q3")                    // q2 = r0" r1" r2" r3", q3 = r4" r5" r6" r7"
                __ASM_EMIT("vuzp.32     q0, q1")                    // q0 = i0" i1" i2" i3", q1 = i4" i5" i6" i7"

                __ASM_EMIT("vst1.32     {q2-q3}, [%[dst_re]]!")
                __ASM_EMIT("cmp         %[i], %[regs]")             // i <=> regs
                __ASM_EMIT("vst1.32     {q0-q1}, [%[dst_im]]!")
                __ASM_EMIT("blo         1b")

                : [dst_re] "+r" (dst_re), [dst_im] "+r" (dst_im),
                  [s_re] "=&r" (s_re), [s_im] "=&r" (s_im),
                  [rrank] "+r" (rrank), [i] "=&r" (i), [j] "=&r" (j)
                : [src_re] "r" (src_re), [src_im] "r" (src_im),
                  [regs] "r" (regs)
                  IF_ARCH_ARM6(, [masks] "r" (__rb_masks))
                : "cc", "memory",
                  "q0", "q1", "q2", "q3", "q4", "q5"
            );
        }
    }

}

#endif /* DSP_ARCH_ARM_NEON_D32_FFT_SCRAMBLE_H_ */