1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 4 янв. 2020 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 22 #ifndef DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_ 23 #define DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_ 24 25 #ifndef DSP_ARCH_AARCH64_ASIMD_IMPL 26 #error "This header should not be included directly" 27 #endif /* DSP_ARCH_AARCH64_ASIMD_IMPL */ 28 29 namespace asimd 30 { filter_transfer_calc_ri(float * re,float * im,const f_cascade_t * c,const float * freq,size_t count)31 void filter_transfer_calc_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count) 32 { 33 ARCH_AARCH64_ASM( 34 // Unpack filter params 35 __ASM_EMIT("ld3r {v18.4s, v19.4s, v20.4s}, [%[c]]") 36 __ASM_EMIT("add %[c], %[c], #0x10") 37 __ASM_EMIT("ld3r {v21.4s, v22.4s, v23.4s}, [%[c]]") 38 // x8 blocks 39 __ASM_EMIT("subs %[count], %[count], #8") 40 __ASM_EMIT("b.lo 2f") 41 __ASM_EMIT("1:") 42 __ASM_EMIT("ldp q6, q7, [%[f]]") // v6 = f 43 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 44 __ASM_EMIT("fmul v17.4s, v7.4s, v7.4s") 45 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 46 __ASM_EMIT("fmul v5.4s, v19.4s, v7.4s") 47 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 48 __ASM_EMIT("fmul v7.4s, v22.4s, v7.4s") 49 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 50 __ASM_EMIT("fmul v1.4s, v20.4s, v17.4s") 51 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 52 __ASM_EMIT("fmul v17.4s, v23.4s, v17.4s") 53 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 54 __ASM_EMIT("fsub v4.4s, v18.4s, v1.4s") 55 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 56 __ASM_EMIT("fsub v17.4s, v21.4s, v17.4s") 57 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 58 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 59 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 60 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 61 __ASM_EMIT("fmul v2.4s, v4.4s, v17.4s") 62 __ASM_EMIT("fmul v3.4s, v5.4s, v17.4s") 63 __ASM_EMIT("fmla v2.4s, v5.4s, v7.4s") 64 __ASM_EMIT("fmls v3.4s, v4.4s, v7.4s") 65 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 66 __ASM_EMIT("fmul v5.4s, v17.4s, v17.4s") 67 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 68 __ASM_EMIT("fmla v5.4s, v7.4s, v7.4s") 69 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 70 __ASM_EMIT("frecpe v7.4s, v5.4s") 71 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 72 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 73 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 74 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 75 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 76 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 77 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 78 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 79 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 80 __ASM_EMIT("fmul v2.4s, v2.4s, v7.4s") 81 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 82 __ASM_EMIT("fmul v3.4s, v3.4s, v7.4s") 83 // Store data 84 __ASM_EMIT("stp q0, q2, [%[re]]") 85 __ASM_EMIT("stp q1, q3, [%[im]]") 86 __ASM_EMIT("subs %[count], %[count], #8") 87 __ASM_EMIT("add %[f], %[f], #0x20") 88 __ASM_EMIT("add %[re], %[re], #0x20") 89 __ASM_EMIT("add %[im], %[im], #0x20") 90 __ASM_EMIT("b.hs 1b") 91 __ASM_EMIT("2:") 92 // x4 blocks 93 __ASM_EMIT("adds %[count], %[count], #4") 94 __ASM_EMIT("b.lt 4f") 95 __ASM_EMIT("1:") 96 __ASM_EMIT("ldr q6, [%[f]]") // v6 = f 97 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 98 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 99 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 100 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 101 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 102 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 103 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 104 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 105 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 106 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 107 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 108 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 109 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 110 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 111 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 112 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 113 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 114 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 115 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 116 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 117 // Update data 118 __ASM_EMIT("str q0, [%[re]]") 119 __ASM_EMIT("str q1, [%[im]]") 120 __ASM_EMIT("sub %[count], %[count], #4") 121 __ASM_EMIT("add %[f], %[f], #0x10") 122 __ASM_EMIT("add %[re], %[re], #0x10") 123 __ASM_EMIT("add %[im], %[im], #0x10") 124 __ASM_EMIT("4:") 125 // x2 blocks 126 __ASM_EMIT("adds %[count], %[count], #2") 127 __ASM_EMIT("blt 6f") 128 __ASM_EMIT("1:") 129 __ASM_EMIT("ldr d6, [%[f]]") // v6 = f 130 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 131 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 132 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 133 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 134 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 135 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 136 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 137 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 138 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 139 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 140 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 141 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 142 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 143 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 144 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 145 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 146 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 147 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 148 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 149 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 150 // Update data 151 __ASM_EMIT("str d0, [%[re]]") 152 __ASM_EMIT("str d1, [%[im]]") 153 __ASM_EMIT("sub %[count], %[count], #2") 154 __ASM_EMIT("add %[f], %[f], #0x08") 155 __ASM_EMIT("add %[re], %[re], #0x08") 156 __ASM_EMIT("add %[im], %[im], #0x08") 157 __ASM_EMIT("6:") 158 // x1 blocks 159 __ASM_EMIT("adds %[count], %[count], #1") 160 __ASM_EMIT("b.lt 8f") 161 __ASM_EMIT("1:") 162 __ASM_EMIT("ld1r {v6.4s}, [%[f]]") // v6 = f 163 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 164 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 165 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 166 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 167 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 168 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 169 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 170 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 171 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 172 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 173 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 174 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 175 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 176 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 177 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 178 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 179 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 180 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 181 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 182 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 183 // Update data 184 __ASM_EMIT("st1 {v0.s}[0], [%[re]]") 185 __ASM_EMIT("st1 {v1.s}[0], [%[im]]") 186 __ASM_EMIT("8:") 187 188 : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq), 189 [count] "+r" (count), [c] "+r" (c) 190 : 191 : "cc", "memory", 192 "v0", "v1", "v2", "v3", 193 "v4", "v5", "v6", "v7", 194 "v16", "v17", "v18", "v19", 195 "v20", "v21", "v22", "v23" 196 ); 197 } 198 filter_transfer_apply_ri(float * re,float * im,const f_cascade_t * c,const float * freq,size_t count)199 void filter_transfer_apply_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count) 200 { 201 ARCH_AARCH64_ASM( 202 // Unpack filter params 203 __ASM_EMIT("ld3r {v18.4s, v19.4s, v20.4s}, [%[c]]") 204 __ASM_EMIT("add %[c], %[c], #0x10") 205 __ASM_EMIT("ld3r {v21.4s, v22.4s, v23.4s}, [%[c]]") 206 // x8 blocks 207 __ASM_EMIT("subs %[count], %[count], #8") 208 __ASM_EMIT("b.lo 2f") 209 __ASM_EMIT("1:") 210 __ASM_EMIT("ldp q6, q7, [%[f]]") // v6 = f 211 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 212 __ASM_EMIT("fmul v17.4s, v7.4s, v7.4s") 213 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 214 __ASM_EMIT("fmul v5.4s, v19.4s, v7.4s") 215 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 216 __ASM_EMIT("fmul v7.4s, v22.4s, v7.4s") 217 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 218 __ASM_EMIT("fmul v1.4s, v20.4s, v17.4s") 219 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 220 __ASM_EMIT("fmul v17.4s, v23.4s, v17.4s") 221 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 222 __ASM_EMIT("fsub v4.4s, v18.4s, v1.4s") 223 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 224 __ASM_EMIT("fsub v17.4s, v21.4s, v17.4s") 225 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 226 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 227 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 228 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 229 __ASM_EMIT("fmul v2.4s, v4.4s, v17.4s") 230 __ASM_EMIT("fmul v3.4s, v5.4s, v17.4s") 231 __ASM_EMIT("fmla v2.4s, v5.4s, v7.4s") 232 __ASM_EMIT("fmls v3.4s, v4.4s, v7.4s") 233 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 234 __ASM_EMIT("fmul v5.4s, v17.4s, v17.4s") 235 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 236 __ASM_EMIT("fmla v5.4s, v7.4s, v7.4s") 237 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 238 __ASM_EMIT("frecpe v7.4s, v5.4s") 239 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 240 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 241 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 242 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 243 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 244 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 245 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 246 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 247 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 248 __ASM_EMIT("fmul v2.4s, v2.4s, v7.4s") 249 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 250 __ASM_EMIT("fmul v3.4s, v3.4s, v7.4s") 251 // Update data 252 __ASM_EMIT("ldp q6, q7, [%[re]]") // v6 = b_re 253 __ASM_EMIT("ldp q16, q17, [%[im]]") // v16 = b_im 254 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 255 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 256 __ASM_EMIT("fmls v4.4s, v1.4s, v16.4s") // v4 = a_re*b_re - a_im*b_im 257 __ASM_EMIT("fmla v5.4s, v0.4s, v16.4s") // v5 = a_im*b_re + a_re*b_im 258 __ASM_EMIT("fmul v6.4s, v2.4s, v7.4s") 259 __ASM_EMIT("fmul v7.4s, v3.4s, v7.4s") 260 __ASM_EMIT("fmls v6.4s, v3.4s, v17.4s") 261 __ASM_EMIT("fmla v7.4s, v2.4s, v17.4s") 262 __ASM_EMIT("stp q4, q6, [%[re]]") 263 __ASM_EMIT("stp q5, q7, [%[im]]") 264 __ASM_EMIT("subs %[count], %[count], #8") 265 __ASM_EMIT("add %[f], %[f], #0x20") 266 __ASM_EMIT("add %[re], %[re], #0x20") 267 __ASM_EMIT("add %[im], %[im], #0x20") 268 __ASM_EMIT("b.hs 1b") 269 __ASM_EMIT("2:") 270 // x4 blocks 271 __ASM_EMIT("adds %[count], %[count], #4") 272 __ASM_EMIT("b.lt 4f") 273 __ASM_EMIT("1:") 274 __ASM_EMIT("ldr q6, [%[f]]") // v6 = f 275 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 276 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 277 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 278 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 279 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 280 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 281 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 282 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 283 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 284 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 285 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 286 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 287 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 288 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 289 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 290 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 291 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 292 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 293 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 294 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 295 // Update data 296 __ASM_EMIT("ldr q6, [%[re]]") // v6 = b_re 297 __ASM_EMIT("ldr q16, [%[im]]") // v16 = b_im 298 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 299 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 300 __ASM_EMIT("fmls v4.4s, v1.4s, v16.4s") // v4 = a_re*b_re - a_im*b_im 301 __ASM_EMIT("fmla v5.4s, v0.4s, v16.4s") // v5 = a_im*b_re + a_re*b_im 302 __ASM_EMIT("str q4, [%[re]]") 303 __ASM_EMIT("str q5, [%[im]]") 304 __ASM_EMIT("sub %[count], %[count], #4") 305 __ASM_EMIT("add %[f], %[f], #0x10") 306 __ASM_EMIT("add %[re], %[re], #0x10") 307 __ASM_EMIT("add %[im], %[im], #0x10") 308 __ASM_EMIT("4:") 309 // x2 blocks 310 __ASM_EMIT("adds %[count], %[count], #2") 311 __ASM_EMIT("blt 6f") 312 __ASM_EMIT("1:") 313 __ASM_EMIT("ldr d6, [%[f]]") // v6 = f 314 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 315 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 316 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 317 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 318 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 319 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 320 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 321 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 322 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 323 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 324 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 325 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 326 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 327 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 328 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 329 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 330 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 331 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 332 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 333 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 334 // Update data 335 __ASM_EMIT("ldr d6, [%[re]]") // v6 = b_re 336 __ASM_EMIT("ldr d16, [%[im]]") // v16 = b_im 337 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 338 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 339 __ASM_EMIT("fmls v4.4s, v1.4s, v16.4s") // v4 = a_re*b_re - a_im*b_im 340 __ASM_EMIT("fmla v5.4s, v0.4s, v16.4s") // v5 = a_im*b_re + a_re*b_im 341 __ASM_EMIT("str d4, [%[re]]") 342 __ASM_EMIT("str d5, [%[im]]") 343 __ASM_EMIT("sub %[count], %[count], #2") 344 __ASM_EMIT("add %[f], %[f], #0x08") 345 __ASM_EMIT("add %[re], %[re], #0x08") 346 __ASM_EMIT("add %[im], %[im], #0x08") 347 __ASM_EMIT("6:") 348 // x1 blocks 349 __ASM_EMIT("adds %[count], %[count], #1") 350 __ASM_EMIT("b.lt 8f") 351 __ASM_EMIT("1:") 352 __ASM_EMIT("ld1r {v6.4s}, [%[f]]") // v6 = f 353 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 354 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 355 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 356 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 357 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 358 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 359 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 360 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 361 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 362 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 363 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 364 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 365 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 366 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 367 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 368 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 369 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 370 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 371 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 372 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 373 // Update data 374 __ASM_EMIT("ld1r {v6.4s}, [%[re]]") // v6 = b_re 375 __ASM_EMIT("ld1r {v16.4s}, [%[im]]") // v16 = b_im 376 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 377 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 378 __ASM_EMIT("fmls v4.4s, v1.4s, v16.4s") // v4 = a_re*b_re - a_im*b_im 379 __ASM_EMIT("fmla v5.4s, v0.4s, v16.4s") // v5 = a_im*b_re + a_re*b_im 380 __ASM_EMIT("st1 {v4.s}[0], [%[re]]") 381 __ASM_EMIT("st1 {v5.s}[0], [%[im]]") 382 __ASM_EMIT("8:") 383 384 : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq), 385 [count] "+r" (count), [c] "+r" (c) 386 : 387 : "cc", "memory", 388 "v0", "v1", "v2", "v3", 389 "v4", "v5", "v6", "v7", 390 "v16", "v17", "v18", "v19", 391 "v20", "v21", "v22", "v23" 392 ); 393 } 394 filter_transfer_calc_pc(float * dst,const f_cascade_t * c,const float * freq,size_t count)395 void filter_transfer_calc_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count) 396 { 397 ARCH_AARCH64_ASM( 398 // Unpack filter params 399 __ASM_EMIT("ld3r {v18.4s, v19.4s, v20.4s}, [%[c]]") 400 __ASM_EMIT("add %[c], %[c], #0x10") 401 __ASM_EMIT("ld3r {v21.4s, v22.4s, v23.4s}, [%[c]]") 402 // x8 blocks 403 __ASM_EMIT("subs %[count], %[count], #8") 404 __ASM_EMIT("b.lo 2f") 405 __ASM_EMIT("1:") 406 __ASM_EMIT("ldp q6, q7, [%[f]]") // v6 = f 407 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 408 __ASM_EMIT("fmul v17.4s, v7.4s, v7.4s") 409 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 410 __ASM_EMIT("fmul v5.4s, v19.4s, v7.4s") 411 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 412 __ASM_EMIT("fmul v7.4s, v22.4s, v7.4s") 413 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 414 __ASM_EMIT("fmul v1.4s, v20.4s, v17.4s") 415 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 416 __ASM_EMIT("fmul v17.4s, v23.4s, v17.4s") 417 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 418 __ASM_EMIT("fsub v4.4s, v18.4s, v1.4s") 419 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 420 __ASM_EMIT("fsub v17.4s, v21.4s, v17.4s") 421 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 422 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 423 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 424 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 425 __ASM_EMIT("fmul v2.4s, v4.4s, v17.4s") 426 __ASM_EMIT("fmul v3.4s, v5.4s, v17.4s") 427 __ASM_EMIT("fmla v2.4s, v5.4s, v7.4s") 428 __ASM_EMIT("fmls v3.4s, v4.4s, v7.4s") 429 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 430 __ASM_EMIT("fmul v5.4s, v17.4s, v17.4s") 431 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 432 __ASM_EMIT("fmla v5.4s, v7.4s, v7.4s") 433 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 434 __ASM_EMIT("frecpe v7.4s, v5.4s") 435 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 436 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 437 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 438 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 439 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 440 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 441 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 442 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 443 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 444 __ASM_EMIT("fmul v2.4s, v2.4s, v7.4s") 445 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 446 __ASM_EMIT("fmul v3.4s, v3.4s, v7.4s") 447 // Store data 448 __ASM_EMIT("st2 {v0.4s, v1.4s}, [%[dst]]") 449 __ASM_EMIT("add %[dst], %[dst], #0x20") 450 __ASM_EMIT("st2 {v2.4s, v3.4s}, [%[dst]]") 451 __ASM_EMIT("add %[dst], %[dst], #0x20") 452 __ASM_EMIT("subs %[count], %[count], #8") 453 __ASM_EMIT("add %[f], %[f], #0x20") 454 __ASM_EMIT("b.hs 1b") 455 __ASM_EMIT("2:") 456 // x4 blocks 457 __ASM_EMIT("adds %[count], %[count], #4") 458 __ASM_EMIT("b.lt 4f") 459 __ASM_EMIT("1:") 460 __ASM_EMIT("ldr q6, [%[f]]") // v6 = f 461 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 462 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 463 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 464 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 465 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 466 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 467 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 468 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 469 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 470 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 471 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 472 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 473 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 474 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 475 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 476 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 477 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 478 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 479 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 480 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 481 // Store data 482 __ASM_EMIT("st2 {v0.4s, v1.4s}, [%[dst]]") 483 __ASM_EMIT("add %[dst], %[dst], #0x20") 484 __ASM_EMIT("sub %[count], %[count], #4") 485 __ASM_EMIT("add %[f], %[f], #0x10") 486 __ASM_EMIT("4:") 487 // x2 blocks 488 __ASM_EMIT("adds %[count], %[count], #2") 489 __ASM_EMIT("blt 6f") 490 __ASM_EMIT("1:") 491 __ASM_EMIT("ldr d6, [%[f]]") // v6 = f 492 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 493 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 494 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 495 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 496 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 497 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 498 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 499 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 500 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 501 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 502 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 503 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 504 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 505 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 506 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 507 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 508 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 509 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 510 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 511 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 512 // Update data 513 __ASM_EMIT("st2 {v0.2s, v1.2s}, [%[dst]]") 514 __ASM_EMIT("add %[dst], %[dst], #0x10") 515 __ASM_EMIT("sub %[count], %[count], #2") 516 __ASM_EMIT("add %[f], %[f], #0x08") 517 __ASM_EMIT("6:") 518 // x1 blocks 519 __ASM_EMIT("adds %[count], %[count], #1") 520 __ASM_EMIT("b.lt 8f") 521 __ASM_EMIT("1:") 522 __ASM_EMIT("ld1r {v6.4s}, [%[f]]") // v6 = f 523 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 524 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 525 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 526 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 527 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 528 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 529 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 530 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 531 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 532 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 533 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 534 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 535 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 536 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 537 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 538 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 539 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 540 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 541 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 542 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 543 // Update data 544 __ASM_EMIT("st2 {v0.s, v1.s}[0], [%[dst]]") 545 __ASM_EMIT("8:") 546 547 : [dst] "+r" (dst), [f] "+r" (freq), 548 [count] "+r" (count), [c] "+r" (c) 549 : 550 : "cc", "memory", 551 "v0", "v1", "v2", "v3", 552 "v4", "v5", "v6", "v7", 553 "v16", "v17", "v18", "v19", 554 "v20", "v21", "v22", "v23" 555 ); 556 } 557 filter_transfer_apply_pc(float * dst,const f_cascade_t * c,const float * freq,size_t count)558 void filter_transfer_apply_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count) 559 { 560 ARCH_AARCH64_ASM( 561 // Unpack filter params 562 __ASM_EMIT("ld3r {v18.4s, v19.4s, v20.4s}, [%[c]]") 563 __ASM_EMIT("add %[c], %[c], #0x10") 564 __ASM_EMIT("ld3r {v21.4s, v22.4s, v23.4s}, [%[c]]") 565 // x8 blocks 566 __ASM_EMIT("subs %[count], %[count], #8") 567 __ASM_EMIT("b.lo 2f") 568 __ASM_EMIT("1:") 569 __ASM_EMIT("ldp q6, q7, [%[f]]") // v6 = f 570 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 571 __ASM_EMIT("fmul v17.4s, v7.4s, v7.4s") 572 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 573 __ASM_EMIT("fmul v5.4s, v19.4s, v7.4s") 574 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 575 __ASM_EMIT("fmul v7.4s, v22.4s, v7.4s") 576 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 577 __ASM_EMIT("fmul v1.4s, v20.4s, v17.4s") 578 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 579 __ASM_EMIT("fmul v17.4s, v23.4s, v17.4s") 580 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 581 __ASM_EMIT("fsub v4.4s, v18.4s, v1.4s") 582 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 583 __ASM_EMIT("fsub v17.4s, v21.4s, v17.4s") 584 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 585 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 586 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 587 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 588 __ASM_EMIT("fmul v2.4s, v4.4s, v17.4s") 589 __ASM_EMIT("fmul v3.4s, v5.4s, v17.4s") 590 __ASM_EMIT("fmla v2.4s, v5.4s, v7.4s") 591 __ASM_EMIT("fmls v3.4s, v4.4s, v7.4s") 592 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 593 __ASM_EMIT("fmul v5.4s, v17.4s, v17.4s") 594 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 595 __ASM_EMIT("fmla v5.4s, v7.4s, v7.4s") 596 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 597 __ASM_EMIT("frecpe v7.4s, v5.4s") 598 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 599 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 600 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 601 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 602 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 603 __ASM_EMIT("frecps v17.4s, v7.4s, v5.4s") 604 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 605 __ASM_EMIT("fmul v7.4s, v17.4s, v7.4s") 606 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 607 __ASM_EMIT("fmul v2.4s, v2.4s, v7.4s") 608 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 609 __ASM_EMIT("fmul v3.4s, v3.4s, v7.4s") 610 // Update data 611 __ASM_EMIT("ld2 {v6.4s, v7.4s}, [%[dst]]") // v6 = r0 r1 r2 r3, v7 = i0 i1 i2 i3 612 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 613 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 614 __ASM_EMIT("fmls v4.4s, v1.4s, v7.4s") // v4 = a_re*b_re - a_im*b_im 615 __ASM_EMIT("fmla v5.4s, v0.4s, v7.4s") // v5 = a_im*b_re + a_re*b_im 616 __ASM_EMIT("st2 {v4.4s, v5.4s}, [%[dst]]") 617 __ASM_EMIT("add %[dst], %[dst], #0x20") 618 __ASM_EMIT("ld2 {v6.4s, v7.4s}, [%[dst]]") // v6 = r4 r5 r6 r7, v7 = i4 i5 i6 i7 619 __ASM_EMIT("fmul v4.4s, v2.4s, v6.4s") 620 __ASM_EMIT("fmul v5.4s, v3.4s, v6.4s") 621 __ASM_EMIT("fmls v4.4s, v3.4s, v7.4s") 622 __ASM_EMIT("fmla v5.4s, v2.4s, v7.4s") 623 __ASM_EMIT("st2 {v4.4s, v5.4s}, [%[dst]]") 624 __ASM_EMIT("add %[dst], %[dst], #0x20") 625 __ASM_EMIT("subs %[count], %[count], #8") 626 __ASM_EMIT("add %[f], %[f], #0x20") 627 __ASM_EMIT("b.hs 1b") 628 __ASM_EMIT("2:") 629 // x4 blocks 630 __ASM_EMIT("adds %[count], %[count], #4") 631 __ASM_EMIT("b.lt 4f") 632 __ASM_EMIT("1:") 633 __ASM_EMIT("ldr q6, [%[f]]") // v6 = f 634 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 635 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 636 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 637 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 638 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 639 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 640 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 641 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 642 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 643 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 644 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 645 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 646 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 647 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 648 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 649 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 650 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 651 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 652 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 653 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 654 // Update data 655 __ASM_EMIT("ld2 {v6.4s, v7.4s}, [%[dst]]") // v6 = r0 r1 r2 r3, v7 = i0 i1 i2 i3 656 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 657 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 658 __ASM_EMIT("fmls v4.4s, v1.4s, v7.4s") // v4 = a_re*b_re - a_im*b_im 659 __ASM_EMIT("fmla v5.4s, v0.4s, v7.4s") // v5 = a_im*b_re + a_re*b_im 660 __ASM_EMIT("st2 {v4.4s, v5.4s}, [%[dst]]") 661 __ASM_EMIT("add %[dst], %[dst], #0x20") 662 __ASM_EMIT("sub %[count], %[count], #4") 663 __ASM_EMIT("add %[f], %[f], #0x10") 664 __ASM_EMIT("4:") 665 // x2 blocks 666 __ASM_EMIT("adds %[count], %[count], #2") 667 __ASM_EMIT("blt 6f") 668 __ASM_EMIT("1:") 669 __ASM_EMIT("ldr d6, [%[f]]") // v6 = f 670 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 671 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 672 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 673 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 674 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 675 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 676 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 677 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 678 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 679 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 680 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 681 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 682 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 683 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 684 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 685 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 686 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 687 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 688 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 689 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 690 // Update data 691 __ASM_EMIT("ld2 {v6.2s, v7.2s}, [%[dst]]") // v6 = r0 r1 r2 r3, v7 = i0 i1 i2 i3 692 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 693 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 694 __ASM_EMIT("fmls v4.4s, v1.4s, v7.4s") // v4 = a_re*b_re - a_im*b_im 695 __ASM_EMIT("fmla v5.4s, v0.4s, v7.4s") // v5 = a_im*b_re + a_re*b_im 696 __ASM_EMIT("st2 {v4.2s, v5.2s}, [%[dst]]") 697 __ASM_EMIT("add %[dst], %[dst], #0x10") 698 __ASM_EMIT("sub %[count], %[count], #2") 699 __ASM_EMIT("add %[f], %[f], #0x08") 700 __ASM_EMIT("6:") 701 // x1 blocks 702 __ASM_EMIT("adds %[count], %[count], #1") 703 __ASM_EMIT("b.lt 8f") 704 __ASM_EMIT("1:") 705 __ASM_EMIT("ld1r {v6.4s}, [%[f]]") // v6 = f 706 __ASM_EMIT("fmul v16.4s, v6.4s, v6.4s") // v16 = f2 = f*f 707 __ASM_EMIT("fmul v3.4s, v19.4s, v6.4s") // v3 = t_im = t1*f 708 __ASM_EMIT("fmul v6.4s, v22.4s, v6.4s") // v6 = b_im = b1*f 709 __ASM_EMIT("fmul v0.4s, v20.4s, v16.4s") // v0 = t2*f2 710 __ASM_EMIT("fmul v16.4s, v23.4s, v16.4s") // v16 = b2*f2 711 __ASM_EMIT("fsub v2.4s, v18.4s, v0.4s") // v2 = t_re = t0 - t2*f2 712 __ASM_EMIT("fsub v16.4s, v21.4s, v16.4s") // v16 = b_re = b0 - b2*f2 713 __ASM_EMIT("fmul v0.4s, v2.4s, v16.4s") // v0 = t_re*b_re 714 __ASM_EMIT("fmul v1.4s, v3.4s, v16.4s") // v1 = t_im*b_re 715 __ASM_EMIT("fmla v0.4s, v3.4s, v6.4s") // v0 = t_re*b_re + t_im*b_im 716 __ASM_EMIT("fmls v1.4s, v2.4s, v6.4s") // v1 = t_im*b_re - t_re*b_im 717 __ASM_EMIT("fmul v4.4s, v16.4s, v16.4s") // v4 = b_re*b_re 718 __ASM_EMIT("fmla v4.4s, v6.4s, v6.4s") // v4 = W = b_re*b_re + b_im*b_im 719 __ASM_EMIT("frecpe v6.4s, v4.4s") // v6 = s2 720 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2) 721 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = s2' = s2 * (2 - R*s2) 722 __ASM_EMIT("frecps v16.4s, v6.4s, v4.4s") // v16 = (2 - R*s2') 723 __ASM_EMIT("fmul v6.4s, v16.4s, v6.4s") // v6 = 1/W = s2" = s2' * (2 - R*s2') 724 __ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") // v0 = a_re = t_re / W 725 __ASM_EMIT("fmul v1.4s, v1.4s, v6.4s") // v1 = a_im = t_im / W 726 // Update data 727 __ASM_EMIT("ld2r {v6.4s, v7.4s}, [%[dst]]") // v6 = r0, v7 =i0 728 __ASM_EMIT("fmul v4.4s, v0.4s, v6.4s") // v4 = a_re*b_re 729 __ASM_EMIT("fmul v5.4s, v1.4s, v6.4s") // v5 = a_im*b_re 730 __ASM_EMIT("fmls v4.4s, v1.4s, v7.4s") // v4 = a_re*b_re - a_im*b_im 731 __ASM_EMIT("fmla v5.4s, v0.4s, v7.4s") // v5 = a_im*b_re + a_re*b_im 732 __ASM_EMIT("st2 {v4.s, v5.s}[0], [%[dst]]") 733 __ASM_EMIT("8:") 734 735 : [dst] "+r" (dst), [f] "+r" (freq), 736 [count] "+r" (count), [c] "+r" (c) 737 : 738 : "cc", "memory", 739 "v0", "v1", "v2", "v3", 740 "v4", "v5", "v6", "v7", 741 "v16", "v17", "v18", "v19", 742 "v20", "v21", "v22", "v23" 743 ); 744 } 745 } 746 747 #endif /* DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_ */ 748