1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 4 янв. 2020 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_
23 #define DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_
24 
25 #ifndef DSP_ARCH_AARCH64_ASIMD_IMPL
26     #error "This header should not be included directly"
27 #endif /* DSP_ARCH_AARCH64_ASIMD_IMPL */
28 
29 namespace asimd
30 {
    /**
     * Compute the complex transfer value of one filter cascade for each given
     * frequency, writing real and imaginary parts to separate arrays.
     *
     * Six coefficients are read from c: three consecutive 32-bit floats at byte
     * offset 0 (named t0, t1, t2 in the comments below) and three at byte offset
     * 0x10 (named b0, b1, b2). For every f = freq[i] the code evaluates
     *
     *   t_re  = t0 - t2*f*f        t_im = t1*f
     *   b_re  = b0 - b2*f*f        b_im = b1*f
     *   W     = b_re*b_re + b_im*b_im
     *   re[i] = (t_re*b_re + t_im*b_im) / W
     *   im[i] = (t_im*b_re - t_re*b_im) / W
     *
     * which is the complex quotient (t_re + j*t_im) / (b_re + j*b_im).
     *
     * 1/W is not an exact division: it is the FRECPE estimate refined by two
     * FRECPS/FMUL Newton-Raphson steps.
     *
     * Elements are handled in groups of 8 (looped), followed by at most one
     * group of 4, one of 2 and one single element.
     *
     * @param re    destination for the real part (count floats are written)
     * @param im    destination for the imaginary part (count floats are written)
     * @param c     filter cascade; NOTE(review): presumably the t[] coefficients live
     *              at offset 0 and the b[] coefficients at offset 0x10 of f_cascade_t -
     *              confirm against its declaration
     * @param freq  input frequencies (count floats are read)
     * @param count number of elements to process
     */
    void filter_transfer_calc_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count)
    {
        ARCH_AARCH64_ASM(
            // Unpack filter params: each coefficient is broadcast to all four lanes
            // v18 = t0, v19 = t1, v20 = t2, v21 = b0, v22 = b1, v23 = b2
            __ASM_EMIT("ld3r                {v18.4s, v19.4s, v20.4s}, [%[c]]")
            __ASM_EMIT("add                 %[c], %[c], #0x10")
            __ASM_EMIT("ld3r                {v21.4s, v22.4s, v23.4s}, [%[c]]")
            // x8 blocks
            // count -= 8; an unsigned borrow (fewer than 8 elements) skips the loop
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("b.lo                2f")
            __ASM_EMIT("1:")
            // Two independent groups of 4 elements are interleaved; lines without
            // a comment do the same step as the line above for the second group
            __ASM_EMIT("ldp                 q6, q7, [%[f]]")                        // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v17.4s, v7.4s, v7.4s")
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v5.4s, v19.4s, v7.4s")
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v7.4s, v22.4s, v7.4s")
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v1.4s, v20.4s, v17.4s")
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fmul                v17.4s, v23.4s, v17.4s")
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v4.4s, v18.4s, v1.4s")
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fsub                v17.4s, v21.4s, v17.4s")
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            // Second group: v4 = t_re, v5 = t_im, v17 = b_re, v7 = b_im -> v2 (re numerator), v3 (im numerator)
            __ASM_EMIT("fmul                v2.4s, v4.4s, v17.4s")
            __ASM_EMIT("fmul                v3.4s, v5.4s, v17.4s")
            __ASM_EMIT("fmla                v2.4s, v5.4s, v7.4s")
            __ASM_EMIT("fmls                v3.4s, v4.4s, v7.4s")
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmul                v5.4s, v17.4s, v17.4s")
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("fmla                v5.4s, v7.4s, v7.4s")
            // Reciprocal of W: initial estimate plus two Newton-Raphson refinements
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecpe              v7.4s, v5.4s")
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v2.4s, v2.4s, v7.4s")
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            __ASM_EMIT("fmul                v3.4s, v3.4s, v7.4s")
            // Store data
            __ASM_EMIT("stp                 q0, q2, [%[re]]")
            __ASM_EMIT("stp                 q1, q3, [%[im]]")
            // The flags set by this subs survive the adds below (plain add does not
            // touch them) and drive the b.hs: loop while no borrow occurred
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("add                 %[f], %[f], #0x20")
            __ASM_EMIT("add                 %[re], %[re], #0x20")
            __ASM_EMIT("add                 %[im], %[im], #0x20")
            __ASM_EMIT("b.hs                1b")
            __ASM_EMIT("2:")
            // x4 blocks
            // count holds (remaining - 8) here; +4 makes it (remaining - 4),
            // which is negative when fewer than 4 elements remain
            __ASM_EMIT("adds                %[count], %[count], #4")
            __ASM_EMIT("b.lt                4f")
            __ASM_EMIT("1:")
            __ASM_EMIT("ldr                 q6, [%[f]]")                            // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data
            __ASM_EMIT("str                 q0, [%[re]]")
            __ASM_EMIT("str                 q1, [%[im]]")
            __ASM_EMIT("sub                 %[count], %[count], #4")
            __ASM_EMIT("add                 %[f], %[f], #0x10")
            __ASM_EMIT("add                 %[re], %[re], #0x10")
            __ASM_EMIT("add                 %[im], %[im], #0x10")
            __ASM_EMIT("4:")
            // x2 blocks
            // count holds (remaining - 4) here; +2 makes it (remaining - 2)
            __ASM_EMIT("adds                %[count], %[count], #2")
            // NOTE(review): spelled "blt" here while the other tail checks use "b.lt" -
            // presumably an accepted alias of the same branch; consider unifying
            __ASM_EMIT("blt                 6f")
            __ASM_EMIT("1:")
            // Only two floats are loaded and stored; the arithmetic still runs on
            // all four lanes and the upper-lane results are never stored
            __ASM_EMIT("ldr                 d6, [%[f]]")                            // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data
            __ASM_EMIT("str                 d0, [%[re]]")
            __ASM_EMIT("str                 d1, [%[im]]")
            __ASM_EMIT("sub                 %[count], %[count], #2")
            __ASM_EMIT("add                 %[f], %[f], #0x08")
            __ASM_EMIT("add                 %[re], %[re], #0x08")
            __ASM_EMIT("add                 %[im], %[im], #0x08")
            __ASM_EMIT("6:")
            // x1 blocks
            // count holds (remaining - 2) here; +1 makes it (remaining - 1)
            __ASM_EMIT("adds                %[count], %[count], #1")
            __ASM_EMIT("b.lt                8f")
            __ASM_EMIT("1:")
            // The single frequency is broadcast to all lanes; only lane 0 is stored
            __ASM_EMIT("ld1r                {v6.4s}, [%[f]]")                       // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data
            __ASM_EMIT("st1                 {v0.s}[0], [%[re]]")
            __ASM_EMIT("st1                 {v1.s}[0], [%[im]]")
            __ASM_EMIT("8:")

            // All operands are read-write: the asm advances the pointers and
            // consumes count (these are the function's own by-value copies)
            : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq),
              [count] "+r" (count), [c] "+r" (c)
            :
            : "cc", "memory",
              "v0", "v1", "v2", "v3",
              "v4", "v5", "v6", "v7",
              "v16", "v17", "v18", "v19",
              "v20", "v21", "v22", "v23"
        );
    }
198 
    /**
     * Multiply the complex values already stored in re[]/im[] by the complex
     * transfer value of one filter cascade, evaluated at each given frequency.
     *
     * Six coefficients are read from c: three consecutive 32-bit floats at byte
     * offset 0 (named t0, t1, t2 in the comments below) and three at byte offset
     * 0x10 (named b0, b1, b2). For every f = freq[i] the code evaluates
     *
     *   t_re = t0 - t2*f*f        t_im = t1*f
     *   b_re = b0 - b2*f*f        b_im = b1*f
     *   W    = b_re*b_re + b_im*b_im
     *   a_re = (t_re*b_re + t_im*b_im) / W
     *   a_im = (t_im*b_re - t_re*b_im) / W
     *
     * and then updates the arrays in place with the complex product
     *
     *   re'[i] = a_re*re[i] - a_im*im[i]
     *   im'[i] = a_im*re[i] + a_re*im[i]
     *
     * 1/W is not an exact division: it is the FRECPE estimate refined by two
     * FRECPS/FMUL Newton-Raphson steps.
     *
     * Elements are handled in groups of 8 (looped), followed by at most one
     * group of 4, one of 2 and one single element.
     *
     * @param re    real parts, read and overwritten (count floats)
     * @param im    imaginary parts, read and overwritten (count floats)
     * @param c     filter cascade; NOTE(review): presumably the t[] coefficients live
     *              at offset 0 and the b[] coefficients at offset 0x10 of f_cascade_t -
     *              confirm against its declaration
     * @param freq  input frequencies (count floats are read)
     * @param count number of elements to process
     */
    void filter_transfer_apply_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count)
    {
        ARCH_AARCH64_ASM(
            // Unpack filter params: each coefficient is broadcast to all four lanes
            // v18 = t0, v19 = t1, v20 = t2, v21 = b0, v22 = b1, v23 = b2
            __ASM_EMIT("ld3r                {v18.4s, v19.4s, v20.4s}, [%[c]]")
            __ASM_EMIT("add                 %[c], %[c], #0x10")
            __ASM_EMIT("ld3r                {v21.4s, v22.4s, v23.4s}, [%[c]]")
            // x8 blocks
            // count -= 8; an unsigned borrow (fewer than 8 elements) skips the loop
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("b.lo                2f")
            __ASM_EMIT("1:")
            // Two independent groups of 4 elements are interleaved; lines without
            // a comment do the same step as the line above for the second group
            __ASM_EMIT("ldp                 q6, q7, [%[f]]")                        // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v17.4s, v7.4s, v7.4s")
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v5.4s, v19.4s, v7.4s")
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v7.4s, v22.4s, v7.4s")
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v1.4s, v20.4s, v17.4s")
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fmul                v17.4s, v23.4s, v17.4s")
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v4.4s, v18.4s, v1.4s")
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fsub                v17.4s, v21.4s, v17.4s")
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            // Second group: v4 = t_re, v5 = t_im, v17 = b_re, v7 = b_im -> v2 (re numerator), v3 (im numerator)
            __ASM_EMIT("fmul                v2.4s, v4.4s, v17.4s")
            __ASM_EMIT("fmul                v3.4s, v5.4s, v17.4s")
            __ASM_EMIT("fmla                v2.4s, v5.4s, v7.4s")
            __ASM_EMIT("fmls                v3.4s, v4.4s, v7.4s")
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmul                v5.4s, v17.4s, v17.4s")
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("fmla                v5.4s, v7.4s, v7.4s")
            // Reciprocal of W: initial estimate plus two Newton-Raphson refinements
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecpe              v7.4s, v5.4s")
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v2.4s, v2.4s, v7.4s")
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            __ASM_EMIT("fmul                v3.4s, v3.4s, v7.4s")
            // Update data: complex-multiply the stored values (x_re, x_im) by (a_re, a_im).
            // x_re/x_im are the current contents of re[]/im[], unrelated to b_re/b_im above
            __ASM_EMIT("ldp                 q6, q7, [%[re]]")                       // v6   = x_re
            __ASM_EMIT("ldp                 q16, q17, [%[im]]")                     // v16  = x_im
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*x_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*x_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v16.4s")                  // v4   = a_re*x_re - a_im*x_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v16.4s")                  // v5   = a_im*x_re + a_re*x_im
            // Second group: (v2, v3) = (a_re, a_im), (v7, v17) = (x_re, x_im) -> v6 (re), v7 (im)
            __ASM_EMIT("fmul                v6.4s, v2.4s, v7.4s")
            __ASM_EMIT("fmul                v7.4s, v3.4s, v7.4s")
            __ASM_EMIT("fmls                v6.4s, v3.4s, v17.4s")
            __ASM_EMIT("fmla                v7.4s, v2.4s, v17.4s")
            __ASM_EMIT("stp                 q4, q6, [%[re]]")
            __ASM_EMIT("stp                 q5, q7, [%[im]]")
            // The flags set by this subs survive the adds below (plain add does not
            // touch them) and drive the b.hs: loop while no borrow occurred
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("add                 %[f], %[f], #0x20")
            __ASM_EMIT("add                 %[re], %[re], #0x20")
            __ASM_EMIT("add                 %[im], %[im], #0x20")
            __ASM_EMIT("b.hs                1b")
            __ASM_EMIT("2:")
            // x4 blocks
            // count holds (remaining - 8) here; +4 makes it (remaining - 4),
            // which is negative when fewer than 4 elements remain
            __ASM_EMIT("adds                %[count], %[count], #4")
            __ASM_EMIT("b.lt                4f")
            __ASM_EMIT("1:")
            __ASM_EMIT("ldr                 q6, [%[f]]")                            // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: complex-multiply the stored values (x_re, x_im) by (a_re, a_im)
            __ASM_EMIT("ldr                 q6, [%[re]]")                           // v6   = x_re
            __ASM_EMIT("ldr                 q16, [%[im]]")                          // v16  = x_im
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*x_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*x_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v16.4s")                  // v4   = a_re*x_re - a_im*x_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v16.4s")                  // v5   = a_im*x_re + a_re*x_im
            __ASM_EMIT("str                 q4, [%[re]]")
            __ASM_EMIT("str                 q5, [%[im]]")
            __ASM_EMIT("sub                 %[count], %[count], #4")
            __ASM_EMIT("add                 %[f], %[f], #0x10")
            __ASM_EMIT("add                 %[re], %[re], #0x10")
            __ASM_EMIT("add                 %[im], %[im], #0x10")
            __ASM_EMIT("4:")
            // x2 blocks
            // count holds (remaining - 4) here; +2 makes it (remaining - 2)
            __ASM_EMIT("adds                %[count], %[count], #2")
            // NOTE(review): spelled "blt" here while the other tail checks use "b.lt" -
            // presumably an accepted alias of the same branch; consider unifying
            __ASM_EMIT("blt                 6f")
            __ASM_EMIT("1:")
            // Only two floats are loaded and stored; the arithmetic still runs on
            // all four lanes and the upper-lane results are never stored
            __ASM_EMIT("ldr                 d6, [%[f]]")                            // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: complex-multiply the stored values (x_re, x_im) by (a_re, a_im)
            __ASM_EMIT("ldr                 d6, [%[re]]")                           // v6   = x_re
            __ASM_EMIT("ldr                 d16, [%[im]]")                          // v16  = x_im
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*x_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*x_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v16.4s")                  // v4   = a_re*x_re - a_im*x_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v16.4s")                  // v5   = a_im*x_re + a_re*x_im
            __ASM_EMIT("str                 d4, [%[re]]")
            __ASM_EMIT("str                 d5, [%[im]]")
            __ASM_EMIT("sub                 %[count], %[count], #2")
            __ASM_EMIT("add                 %[f], %[f], #0x08")
            __ASM_EMIT("add                 %[re], %[re], #0x08")
            __ASM_EMIT("add                 %[im], %[im], #0x08")
            __ASM_EMIT("6:")
            // x1 blocks
            // count holds (remaining - 2) here; +1 makes it (remaining - 1)
            __ASM_EMIT("adds                %[count], %[count], #1")
            __ASM_EMIT("b.lt                8f")
            __ASM_EMIT("1:")
            // The single frequency is broadcast to all lanes; only lane 0 is stored
            __ASM_EMIT("ld1r                {v6.4s}, [%[f]]")                       // v6   = f
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 (estimate of 1/W)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: complex-multiply the stored value (x_re, x_im) by (a_re, a_im)
            __ASM_EMIT("ld1r                {v6.4s}, [%[re]]")                      // v6   = x_re
            __ASM_EMIT("ld1r                {v16.4s}, [%[im]]")                     // v16  = x_im
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*x_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*x_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v16.4s")                  // v4   = a_re*x_re - a_im*x_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v16.4s")                  // v5   = a_im*x_re + a_re*x_im
            __ASM_EMIT("st1                 {v4.s}[0], [%[re]]")
            __ASM_EMIT("st1                 {v5.s}[0], [%[im]]")
            __ASM_EMIT("8:")

            // All operands are read-write: the asm advances the pointers and
            // consumes count (these are the function's own by-value copies)
            : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq),
              [count] "+r" (count), [c] "+r" (c)
            :
            : "cc", "memory",
              "v0", "v1", "v2", "v3",
              "v4", "v5", "v6", "v7",
              "v16", "v17", "v18", "v19",
              "v20", "v21", "v22", "v23"
        );
    }
394 
    /**
     * Compute the complex transfer function of a filter cascade for a set of
     * frequency points and store it as packed (interleaved) complex numbers.
     *
     * For every value f read from freq the code evaluates
     *     H(f) = (t0 - t2*f*f + j*t1*f) / (b0 - b2*f*f + j*b1*f)
     * where t0, t1, t2 are the three floats stored at c and b0, b1, b2 are the
     * three floats stored 0x10 bytes after c. The result is written as
     *     dst[2*i] = Re H(freq[i]),  dst[2*i + 1] = Im H(freq[i])
     * Previous contents of dst are not read.
     *
     * The quotient is formed by multiplying with 1/W, W = b_re*b_re + b_im*b_im.
     * 1/W comes from the FRECPE estimate refined by two FRECPS steps, not from a
     * divide instruction.
     *
     * Items are processed in a loop of 8, followed by single-shot tails of
     * 4, 2 and 1 items. count is kept biased by the current block size so that
     * the flags of each adds/subs directly tell whether a full block remains.
     *
     * @param dst destination buffer, receives count (re, im) float pairs
     * @param c filter cascade; only the floats at byte offsets 0x00..0x08 and
     *          0x10..0x18 are read
     * @param freq source buffer with count frequency values
     * @param count number of frequency points to process
     */
    void filter_transfer_calc_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count)
    {
        ARCH_AARCH64_ASM(
            // Unpack filter params: each coefficient is broadcast to all 4 lanes
            __ASM_EMIT("ld3r                {v18.4s, v19.4s, v20.4s}, [%[c]]")      // v18  = t0, v19 = t1, v20 = t2
            __ASM_EMIT("add                 %[c], %[c], #0x10")                     // c   += 0x10 bytes
            __ASM_EMIT("ld3r                {v21.4s, v22.4s, v23.4s}, [%[c]]")      // v21  = b0, v22 = b1, v23 = b2
            // x8 blocks: loop while at least 8 items remain (count is biased by -8)
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("b.lo                2f")                                    // borrow: fewer than 8 items
            __ASM_EMIT("1:")
            __ASM_EMIT("ldp                 q6, q7, [%[f]]")                        // v6   = f[0..3], v7 = f[4..7]
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v17.4s, v7.4s, v7.4s")                  // v17  = f2 (second quad)
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v5.4s, v19.4s, v7.4s")                  // v5   = t_im (second quad)
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v7.4s, v22.4s, v7.4s")                  // v7   = b_im (second quad)
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v1.4s, v20.4s, v17.4s")                 // v1   = t2*f2 (second quad)
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fmul                v17.4s, v23.4s, v17.4s")                // v17  = b2*f2 (second quad)
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v4.4s, v18.4s, v1.4s")                  // v4   = t_re (second quad)
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fsub                v17.4s, v21.4s, v17.4s")                // v17  = b_re (second quad)
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v2.4s, v4.4s, v17.4s")                  // v2   = t_re*b_re (second quad)
            __ASM_EMIT("fmul                v3.4s, v5.4s, v17.4s")                  // v3   = t_im*b_re (second quad)
            __ASM_EMIT("fmla                v2.4s, v5.4s, v7.4s")                   // v2   = t_re*b_re + t_im*b_im (second quad)
            __ASM_EMIT("fmls                v3.4s, v4.4s, v7.4s")                   // v3   = t_im*b_re - t_re*b_im (second quad)
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmul                v5.4s, v17.4s, v17.4s")                 // v5   = b_re*b_re (second quad)
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("fmla                v5.4s, v7.4s, v7.4s")                   // v5   = W (second quad)
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecpe              v7.4s, v5.4s")                          // v7   = s2 (second quad)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")                  // v17  = (2 - W*s2) (second quad)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")                  // v7   = s2' (second quad)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")                  // v17  = (2 - W*s2') (second quad)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")                  // v7   = 1/W (second quad)
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v2.4s, v2.4s, v7.4s")                   // v2   = a_re (second quad)
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            __ASM_EMIT("fmul                v3.4s, v3.4s, v7.4s")                   // v3   = a_im (second quad)
            // Store data: st2 interleaves the registers as re0 im0 re1 im1 ...
            __ASM_EMIT("st2                 {v0.4s, v1.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("st2                 {v2.4s, v3.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("add                 %[f], %[f], #0x20")                     // f   += 8 floats
            __ASM_EMIT("b.hs                1b")                                    // no borrow: another 8 items remain
            __ASM_EMIT("2:")
            // x4 blocks: executed at most once, count becomes (remaining - 4)
            __ASM_EMIT("adds                %[count], %[count], #4")
            __ASM_EMIT("b.lt                4f")                                    // fewer than 4 items
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ldr                 q6, [%[f]]")                            // v6   = f[0..3]
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data
            __ASM_EMIT("st2                 {v0.4s, v1.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("sub                 %[count], %[count], #4")                // restore the -4 bias for the new remainder
            __ASM_EMIT("add                 %[f], %[f], #0x10")                     // f   += 4 floats
            __ASM_EMIT("4:")
            // x2 blocks: executed at most once, count becomes (remaining - 2)
            __ASM_EMIT("adds                %[count], %[count], #2")
            __ASM_EMIT("blt                 6f")                                    // fewer than 2 items
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ldr                 d6, [%[f]]")                            // v6   = f[0..1] in the low 64 bits
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data: only the two low lanes of each register are written
            __ASM_EMIT("st2                 {v0.2s, v1.2s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x10")                 // dst += 2 complex numbers
            __ASM_EMIT("sub                 %[count], %[count], #2")                // restore the -2 bias for the new remainder
            __ASM_EMIT("add                 %[f], %[f], #0x08")                     // f   += 2 floats
            __ASM_EMIT("6:")
            // x1 blocks: executed at most once, count becomes (remaining - 1)
            __ASM_EMIT("adds                %[count], %[count], #1")
            __ASM_EMIT("b.lt                8f")                                    // nothing left
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ld1r                {v6.4s}, [%[f]]")                       // v6   = f, replicated to all lanes
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Store data: only lane 0 of each register is written (one complex number)
            __ASM_EMIT("st2                 {v0.s, v1.s}[0], [%[dst]]")
            __ASM_EMIT("8:")

            // All pointers and the counter are modified by the code, hence read-write operands
            : [dst] "+r" (dst), [f] "+r" (freq),
              [count] "+r" (count), [c] "+r" (c)
            :
            : "cc", "memory",
              "v0", "v1", "v2", "v3",
              "v4", "v5", "v6", "v7",
              "v16", "v17", "v18", "v19",
              "v20", "v21", "v22", "v23"
        );
    }
557 
    /**
     * Multiply a buffer of packed (interleaved) complex numbers in place by the
     * complex transfer function of a filter cascade.
     *
     * For every value f read from freq the code evaluates
     *     H(f) = (t0 - t2*f*f + j*t1*f) / (b0 - b2*f*f + j*b1*f)
     * where t0, t1, t2 are the three floats stored at c and b0, b1, b2 are the
     * three floats stored 0x10 bytes after c. The complex number held in
     * (dst[2*i], dst[2*i + 1]) is then replaced by its product with H(freq[i]).
     *
     * The quotient is formed by multiplying with 1/W, W = b_re*b_re + b_im*b_im.
     * 1/W comes from the FRECPE estimate refined by two FRECPS steps, not from a
     * divide instruction.
     *
     * Items are processed in a loop of 8, followed by single-shot tails of
     * 4, 2 and 1 items. count is kept biased by the current block size so that
     * the flags of each adds/subs directly tell whether a full block remains.
     *
     * @param dst buffer of count (re, im) float pairs, updated in place
     * @param c filter cascade; only the floats at byte offsets 0x00..0x08 and
     *          0x10..0x18 are read
     * @param freq source buffer with count frequency values
     * @param count number of frequency points to process
     */
    void filter_transfer_apply_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count)
    {
        ARCH_AARCH64_ASM(
            // Unpack filter params: each coefficient is broadcast to all 4 lanes
            __ASM_EMIT("ld3r                {v18.4s, v19.4s, v20.4s}, [%[c]]")      // v18  = t0, v19 = t1, v20 = t2
            __ASM_EMIT("add                 %[c], %[c], #0x10")                     // c   += 0x10 bytes
            __ASM_EMIT("ld3r                {v21.4s, v22.4s, v23.4s}, [%[c]]")      // v21  = b0, v22 = b1, v23 = b2
            // x8 blocks: loop while at least 8 items remain (count is biased by -8)
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("b.lo                2f")                                    // borrow: fewer than 8 items
            __ASM_EMIT("1:")
            __ASM_EMIT("ldp                 q6, q7, [%[f]]")                        // v6   = f[0..3], v7 = f[4..7]
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v17.4s, v7.4s, v7.4s")                  // v17  = f2 (second quad)
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v5.4s, v19.4s, v7.4s")                  // v5   = t_im (second quad)
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v7.4s, v22.4s, v7.4s")                  // v7   = b_im (second quad)
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v1.4s, v20.4s, v17.4s")                 // v1   = t2*f2 (second quad)
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fmul                v17.4s, v23.4s, v17.4s")                // v17  = b2*f2 (second quad)
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v4.4s, v18.4s, v1.4s")                  // v4   = t_re (second quad)
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fsub                v17.4s, v21.4s, v17.4s")                // v17  = b_re (second quad)
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v2.4s, v4.4s, v17.4s")                  // v2   = t_re*b_re (second quad)
            __ASM_EMIT("fmul                v3.4s, v5.4s, v17.4s")                  // v3   = t_im*b_re (second quad)
            __ASM_EMIT("fmla                v2.4s, v5.4s, v7.4s")                   // v2   = t_re*b_re + t_im*b_im (second quad)
            __ASM_EMIT("fmls                v3.4s, v4.4s, v7.4s")                   // v3   = t_im*b_re - t_re*b_im (second quad)
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmul                v5.4s, v17.4s, v17.4s")                 // v5   = b_re*b_re (second quad)
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("fmla                v5.4s, v7.4s, v7.4s")                   // v5   = W (second quad)
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecpe              v7.4s, v5.4s")                          // v7   = s2 (second quad)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")                  // v17  = (2 - W*s2) (second quad)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")                  // v7   = s2' (second quad)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("frecps              v17.4s, v7.4s, v5.4s")                  // v17  = (2 - W*s2') (second quad)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v7.4s, v17.4s, v7.4s")                  // v7   = 1/W (second quad)
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v2.4s, v2.4s, v7.4s")                   // v2   = a_re (second quad)
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            __ASM_EMIT("fmul                v3.4s, v3.4s, v7.4s")                   // v3   = a_im (second quad)
            // Update data: dst = a * d, where d = (d_re, d_im) is the value currently stored in dst
            __ASM_EMIT("ld2                 {v6.4s, v7.4s}, [%[dst]]")              // v6   = d_re = r0 r1 r2 r3, v7 = d_im = i0 i1 i2 i3
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*d_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*d_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v7.4s")                   // v4   = a_re*d_re - a_im*d_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v7.4s")                   // v5   = a_im*d_re + a_re*d_im
            __ASM_EMIT("st2                 {v4.4s, v5.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("ld2                 {v6.4s, v7.4s}, [%[dst]]")              // v6   = d_re = r4 r5 r6 r7, v7 = d_im = i4 i5 i6 i7
            __ASM_EMIT("fmul                v4.4s, v2.4s, v6.4s")                   // v4   = a_re*d_re (second quad)
            __ASM_EMIT("fmul                v5.4s, v3.4s, v6.4s")                   // v5   = a_im*d_re (second quad)
            __ASM_EMIT("fmls                v4.4s, v3.4s, v7.4s")                   // v4   = a_re*d_re - a_im*d_im (second quad)
            __ASM_EMIT("fmla                v5.4s, v2.4s, v7.4s")                   // v5   = a_im*d_re + a_re*d_im (second quad)
            __ASM_EMIT("st2                 {v4.4s, v5.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("subs                %[count], %[count], #8")
            __ASM_EMIT("add                 %[f], %[f], #0x20")                     // f   += 8 floats
            __ASM_EMIT("b.hs                1b")                                    // no borrow: another 8 items remain
            __ASM_EMIT("2:")
            // x4 blocks: executed at most once, count becomes (remaining - 4)
            __ASM_EMIT("adds                %[count], %[count], #4")
            __ASM_EMIT("b.lt                4f")                                    // fewer than 4 items
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ldr                 q6, [%[f]]")                            // v6   = f[0..3]
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: dst = a * d, where d = (d_re, d_im) is the value currently stored in dst
            __ASM_EMIT("ld2                 {v6.4s, v7.4s}, [%[dst]]")              // v6   = d_re = r0 r1 r2 r3, v7 = d_im = i0 i1 i2 i3
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*d_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*d_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v7.4s")                   // v4   = a_re*d_re - a_im*d_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v7.4s")                   // v5   = a_im*d_re + a_re*d_im
            __ASM_EMIT("st2                 {v4.4s, v5.4s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x20")                 // dst += 4 complex numbers
            __ASM_EMIT("sub                 %[count], %[count], #4")                // restore the -4 bias for the new remainder
            __ASM_EMIT("add                 %[f], %[f], #0x10")                     // f   += 4 floats
            __ASM_EMIT("4:")
            // x2 blocks: executed at most once, count becomes (remaining - 2)
            __ASM_EMIT("adds                %[count], %[count], #2")
            __ASM_EMIT("blt                 6f")                                    // fewer than 2 items
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ldr                 d6, [%[f]]")                            // v6   = f[0..1] in the low 64 bits
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: only two complex numbers are loaded and stored
            __ASM_EMIT("ld2                 {v6.2s, v7.2s}, [%[dst]]")              // v6   = d_re = r0 r1, v7 = d_im = i0 i1
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*d_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*d_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v7.4s")                   // v4   = a_re*d_re - a_im*d_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v7.4s")                   // v5   = a_im*d_re + a_re*d_im
            __ASM_EMIT("st2                 {v4.2s, v5.2s}, [%[dst]]")
            __ASM_EMIT("add                 %[dst], %[dst], #0x10")                 // dst += 2 complex numbers
            __ASM_EMIT("sub                 %[count], %[count], #2")                // restore the -2 bias for the new remainder
            __ASM_EMIT("add                 %[f], %[f], #0x08")                     // f   += 2 floats
            __ASM_EMIT("6:")
            // x1 blocks: executed at most once, count becomes (remaining - 1)
            __ASM_EMIT("adds                %[count], %[count], #1")
            __ASM_EMIT("b.lt                8f")                                    // nothing left
            __ASM_EMIT("1:")                                                        // not used as a branch target here
            __ASM_EMIT("ld1r                {v6.4s}, [%[f]]")                       // v6   = f, replicated to all lanes
            __ASM_EMIT("fmul                v16.4s, v6.4s, v6.4s")                  // v16  = f2 = f*f
            __ASM_EMIT("fmul                v3.4s, v19.4s, v6.4s")                  // v3   = t_im = t1*f
            __ASM_EMIT("fmul                v6.4s, v22.4s, v6.4s")                  // v6   = b_im = b1*f
            __ASM_EMIT("fmul                v0.4s, v20.4s, v16.4s")                 // v0   = t2*f2
            __ASM_EMIT("fmul                v16.4s, v23.4s, v16.4s")                // v16  = b2*f2
            __ASM_EMIT("fsub                v2.4s, v18.4s, v0.4s")                  // v2   = t_re = t0 - t2*f2
            __ASM_EMIT("fsub                v16.4s, v21.4s, v16.4s")                // v16  = b_re = b0 - b2*f2
            __ASM_EMIT("fmul                v0.4s, v2.4s, v16.4s")                  // v0   = t_re*b_re
            __ASM_EMIT("fmul                v1.4s, v3.4s, v16.4s")                  // v1   = t_im*b_re
            __ASM_EMIT("fmla                v0.4s, v3.4s, v6.4s")                   // v0   = t_re*b_re + t_im*b_im
            __ASM_EMIT("fmls                v1.4s, v2.4s, v6.4s")                   // v1   = t_im*b_re - t_re*b_im
            __ASM_EMIT("fmul                v4.4s, v16.4s, v16.4s")                 // v4   = b_re*b_re
            __ASM_EMIT("fmla                v4.4s, v6.4s, v6.4s")                   // v4   = W = b_re*b_re + b_im*b_im
            __ASM_EMIT("frecpe              v6.4s, v4.4s")                          // v6   = s2 = estimate of 1/W
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2)
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = s2' = s2 * (2 - W*s2)
            __ASM_EMIT("frecps              v16.4s, v6.4s, v4.4s")                  // v16  = (2 - W*s2')
            __ASM_EMIT("fmul                v6.4s, v16.4s, v6.4s")                  // v6   = 1/W = s2" = s2' * (2 - W*s2')
            __ASM_EMIT("fmul                v0.4s, v0.4s, v6.4s")                   // v0   = a_re = (t_re*b_re + t_im*b_im) / W
            __ASM_EMIT("fmul                v1.4s, v1.4s, v6.4s")                   // v1   = a_im = (t_im*b_re - t_re*b_im) / W
            // Update data: one complex number, replicated on load, lane 0 stored back
            __ASM_EMIT("ld2r                {v6.4s, v7.4s}, [%[dst]]")              // v6   = d_re = r0, v7 = d_im = i0
            __ASM_EMIT("fmul                v4.4s, v0.4s, v6.4s")                   // v4   = a_re*d_re
            __ASM_EMIT("fmul                v5.4s, v1.4s, v6.4s")                   // v5   = a_im*d_re
            __ASM_EMIT("fmls                v4.4s, v1.4s, v7.4s")                   // v4   = a_re*d_re - a_im*d_im
            __ASM_EMIT("fmla                v5.4s, v0.4s, v7.4s")                   // v5   = a_im*d_re + a_re*d_im
            __ASM_EMIT("st2                 {v4.s, v5.s}[0], [%[dst]]")
            __ASM_EMIT("8:")

            // All pointers and the counter are modified by the code, hence read-write operands
            : [dst] "+r" (dst), [f] "+r" (freq),
              [count] "+r" (count), [c] "+r" (c)
            :
            : "cc", "memory",
              "v0", "v1", "v2", "v3",
              "v4", "v5", "v6", "v7",
              "v16", "v17", "v18", "v19",
              "v20", "v21", "v22", "v23"
        );
    }
745 }
746 
747 #endif /* DSP_ARCH_AARCH64_ASIMD_FILTERS_TRANSFER_H_ */
748