1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 2 янв. 2020 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef DSP_ARCH_X86_SSE_FILTERS_TRANSFER_H_
23 #define DSP_ARCH_X86_SSE_FILTERS_TRANSFER_H_
24 
25 #ifndef DSP_ARCH_X86_SSE_IMPL
26     #error "This header should not be included directly"
27 #endif /* DSP_ARCH_X86_SSE_IMPL */
28 
29 namespace sse
30 {
    // F_UNPACK: load the eight filter coefficients addressed by %[c]
    // (top coefficients t0 t1 t2 t3 at offset 0x00, bottom coefficients
    // b0 b1 b2 b3 at offset 0x10), broadcast each of t0,t1,t2 and b0,b1,b2
    // across a full XMM register (t3/b3 are not used by the transfer
    // computation) and spill the six vectors to the 16-byte-aligned scratch
    // area %[fp] so F_LOAD can restore them after the core clobbers the regs.
    #define F_UNPACK \
        __ASM_EMIT("movups      0x00(%[c]), %%xmm0")        /* x0   = t0 t1 t2 t3 */ \
        __ASM_EMIT("movups      0x10(%[c]), %%xmm4")        /* x4   = b0 b1 b2 b3 */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* x2   = t0 t1 t2 t3 */ \
        __ASM_EMIT("movaps      %%xmm4, %%xmm6")            /* x6   = b0 b1 b2 b3 */ \
        __ASM_EMIT("unpcklps    %%xmm0, %%xmm0")            /* x0   = t0 t0 t1 t1 */ \
        __ASM_EMIT("unpcklps    %%xmm4, %%xmm4")            /* x4   = b0 b0 b1 b1 */ \
        __ASM_EMIT("unpckhps    %%xmm2, %%xmm2")            /* x2   = t2 t2 t3 t3 */ \
        __ASM_EMIT("unpckhps    %%xmm6, %%xmm6")            /* x6   = b2 b2 b3 b3 */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm1")            /* x1   = t0 t0 t1 t1 */ \
        __ASM_EMIT("movaps      %%xmm4, %%xmm5")            /* x5   = b0 b0 b1 b1 */ \
        __ASM_EMIT("unpcklps    %%xmm0, %%xmm0")            /* x0   = t0 t0 t0 t0 */ \
        __ASM_EMIT("unpcklps    %%xmm4, %%xmm4")            /* x4   = b0 b0 b0 b0 */ \
        __ASM_EMIT("unpckhps    %%xmm1, %%xmm1")            /* x1   = t1 t1 t1 t1 */ \
        __ASM_EMIT("unpckhps    %%xmm5, %%xmm5")            /* x5   = b1 b1 b1 b1 */ \
        __ASM_EMIT("unpcklps    %%xmm2, %%xmm2")            /* x2   = t2 t2 t2 t2 */ \
        __ASM_EMIT("unpcklps    %%xmm6, %%xmm6")            /* x6   = b2 b2 b2 b2 */ \
        __ASM_EMIT("movaps      %%xmm0, 0x00 + %[fp]")      /* x0   = t0 */ \
        __ASM_EMIT("movaps      %%xmm1, 0x10 + %[fp]")      /* x1   = t1 */ \
        __ASM_EMIT("movaps      %%xmm2, 0x20 + %[fp]")      /* x2   = t2 */ \
        __ASM_EMIT("movaps      %%xmm4, 0x30 + %[fp]")      /* x4   = b0 */ \
        __ASM_EMIT("movaps      %%xmm5, 0x40 + %[fp]")      /* x5   = b1 */ \
        __ASM_EMIT("movaps      %%xmm6, 0x50 + %[fp]")      /* x6   = b2 */

    // F_LOAD: restore the six broadcast coefficient vectors previously
    // spilled to %[fp] by F_UNPACK (used to re-prime registers before the
    // next loop iteration).
    #define F_LOAD \
        __ASM_EMIT("movaps      0x00 + %[fp], %%xmm0")      /* x0   = t0 */ \
        __ASM_EMIT("movaps      0x10 + %[fp], %%xmm1")      /* x1   = t1 */ \
        __ASM_EMIT("movaps      0x20 + %[fp], %%xmm2")      /* x2   = t2 */ \
        __ASM_EMIT("movaps      0x30 + %[fp], %%xmm4")      /* x4   = b0 */ \
        __ASM_EMIT("movaps      0x40 + %[fp], %%xmm5")      /* x5   = b1 */ \
        __ASM_EMIT("movaps      0x50 + %[fp], %%xmm6")      /* x6   = b2 */
62 
    // HF_CORE: evaluate the transfer function H = T/B for four frequencies at
    // once, where numerator and denominator are biquad polynomials sampled on
    // the imaginary axis (s = j*f):
    //   t_re = t0 - t2*f^2,  t_im = t1*f   (same form for b_re/b_im)
    // The complex division is performed by multiplying with the conjugate of
    // the denominator and dividing by W = b_re^2 + b_im^2.
    // Input:  x3 = f (frequency vector), x0..x2/x4..x6 = coefficients
    //         as arranged by F_UNPACK / F_LOAD
    // Output: x0 = a_re, x1 = a_im; all other XMM registers are clobbered.
    #define HF_CORE \
        /* Compute H[f] */ \
        __ASM_EMIT("movaps      %%xmm3, %%xmm7")            /* x7   = f */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm1")            /* x1   = t_im = t1 * f */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm7")            /* x7   = f2 = f * f */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm5")            /* x5   = b_im = b1 * f */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm2")            /* x2   = t2 * f2 */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm6")            /* x6   = b2 * f2 */ \
        __ASM_EMIT("subps       %%xmm2, %%xmm0")            /* x0   = t_re = t0 - t2*f2 */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm4")            /* x4   = b_re = b0 - b2*f2 */ \
        __ASM_EMIT("movaps      %%xmm5, %%xmm3")            /* x3   = b_im */ \
        __ASM_EMIT("movaps      %%xmm4, %%xmm2")            /* x2   = b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm3")            /* x3   = b_im * b_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm2")            /* x2   = b_re * b_re */ \
        __ASM_EMIT("addps       %%xmm2, %%xmm3")            /* x3   = W = b_re * b_re + b_im * b_im */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm6")            /* x6   = t_re */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm7")            /* x7   = t_im */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm0")            /* x0   = t_re * b_re */ \
        __ASM_EMIT("mulps       %%xmm5, %%xmm7")            /* x7   = t_im * b_im */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm1")            /* x1   = t_im * b_re */ \
        __ASM_EMIT("mulps       %%xmm5, %%xmm6")            /* x6   = t_re * b_im */ \
        __ASM_EMIT("addps       %%xmm7, %%xmm0")            /* x0   = t_re * b_re + t_im * b_im */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm1")            /* x1   = t_im * b_re - t_re * b_im */ \
        __ASM_EMIT("divps       %%xmm3, %%xmm0")            /* x0   = a_re = (t_re * b_re + t_im * b_im) / W */ \
        __ASM_EMIT("divps       %%xmm3, %%xmm1")            /* x1   = a_im = (t_im * b_re - t_re * b_im) / W */

    // HF_APPLY: complex multiply the freshly computed response (x0 = a_re,
    // x1 = a_im) with the previously accumulated response (x2 = b_re,
    // x3 = b_im):
    //   x0 = a_re*b_re - a_im*b_im (new re), x1 = a_re*b_im + a_im*b_re (new im)
    #define HF_APPLY \
        /* Compute dst = H[f] * dst */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm4")            /* x4   = a_re */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm5")            /* x5   = a_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm0")            /* x0   = a_re * b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm4")            /* x4   = a_re * b_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm1")            /* x1   = a_im * b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm5")            /* x5   = a_im * b_im */ \
        __ASM_EMIT("addps       %%xmm4, %%xmm1")            /* x1   = a_re * b_im + a_im * b_re */ \
        __ASM_EMIT("subps       %%xmm5, %%xmm0")            /* x0   = a_re * b_re - a_im * b_im */
99 
    /**
     * Compute the frequency response of a single biquad filter cascade for an
     * array of frequencies, storing the result as separate real and imaginary
     * buffers. For each frequency f the response is evaluated as:
     *
     *   H(j*f) = (t0 - t2*f^2 + j*t1*f) / (b0 - b2*f^2 + j*b1*f)
     *
     * @param re    destination buffer for the real part, count elements
     * @param im    destination buffer for the imaginary part, count elements
     * @param c     filter cascade coefficients: t0..t3 at offset 0x00,
     *              b0..b3 at offset 0x10 (t3/b3 unused)
     * @param freq  array of frequencies to evaluate, count elements
     * @param count number of frequencies to process
     */
    void filter_transfer_calc_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count)
    {
        // Aligned scratch buffer for the six broadcast coefficient vectors
        // spilled by F_UNPACK and re-read by F_LOAD
        IF_ARCH_X86( float fp[6*4] __lsp_aligned16; );

        ARCH_X86_ASM(
            // Unpack filter params
            F_UNPACK
            // x4 blocks: process 4 frequencies per iteration
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")
            __ASM_EMIT("1:")
            __ASM_EMIT("movups      0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movups      %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movups      %%xmm1, 0x00(%[im])")
            F_LOAD
            __ASM_EMIT("add         $0x10, %[f]")
            __ASM_EMIT("add         $0x10, %[re]")
            __ASM_EMIT("add         $0x10, %[im]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")
            __ASM_EMIT("2:")
            // x2 block: handle 2 remaining frequencies (only low 2 lanes stored)
            __ASM_EMIT("add         $2, %[count]")
            __ASM_EMIT("jl          4f")
            __ASM_EMIT("movlps      0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movlps      %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movlps      %%xmm1, 0x00(%[im])")
            F_LOAD
            __ASM_EMIT("sub         $2, %[count]")
            __ASM_EMIT("add         $0x08, %[f]")
            __ASM_EMIT("add         $0x08, %[re]")
            __ASM_EMIT("add         $0x08, %[im]")
            __ASM_EMIT("4:")
            // x1 block: handle the last remaining frequency, if any
            __ASM_EMIT("add         $1, %[count]")
            __ASM_EMIT("jl          6f")
            __ASM_EMIT("movss       0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movss       %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movss       %%xmm1, 0x00(%[im])")
            __ASM_EMIT("6:")

            : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq), [count] "+r" (count)
            : [c] "r" (c),
              [fp] "o" (fp)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
152 
    /**
     * Compute the frequency response of a single biquad filter cascade and
     * multiply it (complex multiplication) into the existing response stored
     * in the separate real/imaginary buffers:
     *
     *   (re[i] + j*im[i]) *= H(j*freq[i])
     *
     * where H(j*f) = (t0 - t2*f^2 + j*t1*f) / (b0 - b2*f^2 + j*b1*f).
     * Used to accumulate the response of several cascades in sequence.
     *
     * @param re    real part of the accumulated response, updated in place
     * @param im    imaginary part of the accumulated response, updated in place
     * @param c     filter cascade coefficients: t0..t3 at offset 0x00,
     *              b0..b3 at offset 0x10 (t3/b3 unused)
     * @param freq  array of frequencies to evaluate, count elements
     * @param count number of frequencies to process
     */
    void filter_transfer_apply_ri(float *re, float *im, const f_cascade_t *c, const float *freq, size_t count)
    {
        // Aligned scratch buffer for the six broadcast coefficient vectors
        // spilled by F_UNPACK and re-read by F_LOAD
        IF_ARCH_X86( float fp[6*4] __lsp_aligned16; );

        ARCH_X86_ASM(
            // Unpack filter params
            F_UNPACK
            // x4 blocks: process 4 frequencies per iteration
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")
            __ASM_EMIT("1:")
            __ASM_EMIT("movups      0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movups      0x00(%[re]), %%xmm2")       // x2   = b_re
            __ASM_EMIT("movups      0x00(%[im]), %%xmm3")       // x3   = b_im
            HF_APPLY
            __ASM_EMIT("movups      %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movups      %%xmm1, 0x00(%[im])")
            F_LOAD
            __ASM_EMIT("add         $0x10, %[f]")
            __ASM_EMIT("add         $0x10, %[re]")
            __ASM_EMIT("add         $0x10, %[im]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")
            __ASM_EMIT("2:")
            // x2 block: handle 2 remaining frequencies (only low 2 lanes used)
            __ASM_EMIT("add         $2, %[count]")
            __ASM_EMIT("jl          4f")
            __ASM_EMIT("movlps      0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movlps      0x00(%[re]), %%xmm2")       // x2   = b_re
            __ASM_EMIT("movlps      0x00(%[im]), %%xmm3")       // x3   = b_im
            HF_APPLY
            __ASM_EMIT("movlps      %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movlps      %%xmm1, 0x00(%[im])")
            F_LOAD
            __ASM_EMIT("sub         $2, %[count]")
            __ASM_EMIT("add         $0x08, %[f]")
            __ASM_EMIT("add         $0x08, %[re]")
            __ASM_EMIT("add         $0x08, %[im]")
            __ASM_EMIT("4:")
            // x1 block: handle the last remaining frequency, if any
            __ASM_EMIT("add         $1, %[count]")
            __ASM_EMIT("jl          6f")
            __ASM_EMIT("movss       0x00(%[f]), %%xmm3")        // x3   = f
            HF_CORE
            __ASM_EMIT("movss       0x00(%[re]), %%xmm2")       // x2   = b_re
            __ASM_EMIT("movss       0x00(%[im]), %%xmm3")       // x3   = b_im
            HF_APPLY
            __ASM_EMIT("movss       %%xmm0, 0x00(%[re])")
            __ASM_EMIT("movss       %%xmm1, 0x00(%[im])")
            __ASM_EMIT("6:")

            : [re] "+r" (re), [im] "+r" (im), [f] "+r" (freq), [count] "+r" (count)
            : [c] "r" (c),
              [fp] "o" (fp)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
214 
215     #undef HF_CORE
216     #undef HF_APPLY
217 
    // PHF_CORE: identical to HF_CORE above (re-declared because HF_CORE is
    // #undef'd after the re/im functions) — evaluates H = T/B at s = j*f for
    // four frequencies at once.
    // Input:  x3 = f (frequency vector), x0..x2/x4..x6 = coefficients
    //         as arranged by F_UNPACK / F_LOAD
    // Output: x0 = a_re, x1 = a_im; all other XMM registers are clobbered.
    #define PHF_CORE \
        /* Compute H[f] */ \
        __ASM_EMIT("movaps      %%xmm3, %%xmm7")            /* x7   = f */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm1")            /* x1   = t_im = t1 * f */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm7")            /* x7   = f2 = f * f */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm5")            /* x5   = b_im = b1 * f */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm2")            /* x2   = t2 * f2 */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm6")            /* x6   = b2 * f2 */ \
        __ASM_EMIT("subps       %%xmm2, %%xmm0")            /* x0   = t_re = t0 - t2*f2 */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm4")            /* x4   = b_re = b0 - b2*f2 */ \
        __ASM_EMIT("movaps      %%xmm5, %%xmm3")            /* x3   = b_im */ \
        __ASM_EMIT("movaps      %%xmm4, %%xmm2")            /* x2   = b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm3")            /* x3   = b_im * b_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm2")            /* x2   = b_re * b_re */ \
        __ASM_EMIT("addps       %%xmm2, %%xmm3")            /* x3   = W = b_re * b_re + b_im * b_im */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm6")            /* x6   = t_re */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm7")            /* x7   = t_im */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm0")            /* x0   = t_re * b_re */ \
        __ASM_EMIT("mulps       %%xmm5, %%xmm7")            /* x7   = t_im * b_im */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm1")            /* x1   = t_im * b_re */ \
        __ASM_EMIT("mulps       %%xmm5, %%xmm6")            /* x6   = t_re * b_im */ \
        __ASM_EMIT("addps       %%xmm7, %%xmm0")            /* x0   = t_re * b_re + t_im * b_im */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm1")            /* x1   = t_im * b_re - t_re * b_im */ \
        __ASM_EMIT("divps       %%xmm3, %%xmm0")            /* x0   = a_re = (t_re * b_re + t_im * b_im) / W */ \
        __ASM_EMIT("divps       %%xmm3, %%xmm1")            /* x1   = a_im = (t_im * b_re - t_re * b_im) / W */

    // PHF_APPLY: complex multiply the computed response (x0 = a_re,
    // x1 = a_im) with the accumulated response (x2 = b_re, x3 = b_im), then
    // interleave the result into packed-complex layout for storage:
    //   x0 = r0 i0 r1 i1,  x2 = r2 i2 r3 i3
    #define PHF_APPLY \
        /* Compute dst = H[f] * dst */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm4")            /* x4   = a_re */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm5")            /* x5   = a_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm0")            /* x0   = a_re * b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm4")            /* x4   = a_re * b_im */ \
        __ASM_EMIT("mulps       %%xmm2, %%xmm1")            /* x1   = a_im * b_re */ \
        __ASM_EMIT("mulps       %%xmm3, %%xmm5")            /* x5   = a_im * b_im */ \
        __ASM_EMIT("addps       %%xmm4, %%xmm1")            /* x1   = a_re * b_im + a_im * b_re */ \
        __ASM_EMIT("subps       %%xmm5, %%xmm0")            /* x0   = a_re * b_re - a_im * b_im */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* x2   = re */ \
        __ASM_EMIT("unpcklps    %%xmm1, %%xmm0")            /* x0   = r0 i0 r1 i1 */ \
        __ASM_EMIT("unpckhps    %%xmm1, %%xmm2")            /* x2   = r2 i2 r3 i3 */
257 
    /**
     * Compute the frequency response of a single biquad filter cascade for an
     * array of frequencies, storing the result as packed complex numbers
     * (interleaved re/im pairs). For each frequency f:
     *
     *   H(j*f) = (t0 - t2*f^2 + j*t1*f) / (b0 - b2*f^2 + j*b1*f)
     *
     * @param dst   destination buffer of count complex values
     *              (2*count floats, layout r0 i0 r1 i1 ...)
     * @param c     filter cascade coefficients: t0..t3 at offset 0x00,
     *              b0..b3 at offset 0x10 (t3/b3 unused)
     * @param freq  array of frequencies to evaluate, count elements
     * @param count number of frequencies to process
     */
    void filter_transfer_calc_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count)
    {
        // Aligned scratch buffer for the six broadcast coefficient vectors
        // spilled by F_UNPACK and re-read by F_LOAD
        IF_ARCH_X86( float fp[6*4] __lsp_aligned16; );

        ARCH_X86_ASM(
            // Unpack filter params
            F_UNPACK
            // x4 block: process 4 frequencies per iteration
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")
            __ASM_EMIT("1:")
            __ASM_EMIT("movups      0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            // Interleave re/im into packed-complex order before storing
            __ASM_EMIT("movaps      %%xmm0, %%xmm2")            // x2   = re
            __ASM_EMIT("unpcklps    %%xmm1, %%xmm0")            // x0   = r0 i0 r1 i1
            __ASM_EMIT("unpckhps    %%xmm1, %%xmm2")            // x2   = r2 i2 r3 i3
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x10(%[dst])")
            // Load filter params and repeat loop
            F_LOAD
            __ASM_EMIT("add         $0x10, %[f]")
            __ASM_EMIT("add         $0x20, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")
            __ASM_EMIT("2:")
            // x2 block: handle 2 remaining frequencies (only low 2 lanes used)
            __ASM_EMIT("add         $2, %[count]")
            __ASM_EMIT("jl          4f")
            __ASM_EMIT("movlps      0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            __ASM_EMIT("unpcklps    %%xmm1, %%xmm0")            // x0   = r0 i0 r1 i1
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            F_LOAD
            __ASM_EMIT("sub         $2, %[count]")
            __ASM_EMIT("add         $0x08, %[f]")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("4:")
            // x1 block: handle the last remaining frequency, if any
            __ASM_EMIT("add         $1, %[count]")
            __ASM_EMIT("jl          6f")
            __ASM_EMIT("movss       0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            __ASM_EMIT("unpcklps    %%xmm1, %%xmm0")            // x0   = r0 i0 r1 i1
            __ASM_EMIT("movlps      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("6:")

            : [dst] "+r" (dst), [f] "+r" (freq), [count] "+r" (count)
            : [c] "r" (c),
              [fp] "o" (fp)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
312 
    /**
     * Compute the frequency response of a single biquad filter cascade and
     * multiply it (complex multiplication) into the existing packed-complex
     * response buffer:
     *
     *   dst[i] *= H(j*freq[i])
     *
     * where H(j*f) = (t0 - t2*f^2 + j*t1*f) / (b0 - b2*f^2 + j*b1*f).
     * Used to accumulate the response of several cascades in sequence.
     *
     * @param dst   buffer of count complex values, updated in place
     *              (2*count floats, layout r0 i0 r1 i1 ...)
     * @param c     filter cascade coefficients: t0..t3 at offset 0x00,
     *              b0..b3 at offset 0x10 (t3/b3 unused)
     * @param freq  array of frequencies to evaluate, count elements
     * @param count number of frequencies to process
     */
    void filter_transfer_apply_pc(float *dst, const f_cascade_t *c, const float *freq, size_t count)
    {
        // Aligned scratch buffer for the six broadcast coefficient vectors
        // spilled by F_UNPACK and re-read by F_LOAD
        IF_ARCH_X86( float fp[6*4] __lsp_aligned16; );

        ARCH_X86_ASM(
            // Unpack filter params
            F_UNPACK
            // x4 block: process 4 frequencies per iteration
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")
            __ASM_EMIT("1:")
            __ASM_EMIT("movups      0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            // De-interleave the packed-complex destination into re/im vectors
            __ASM_EMIT("movups      0x00(%[dst]), %%xmm2")      // x2   = br0 bi0 br1 bi1
            __ASM_EMIT("movups      0x10(%[dst]), %%xmm4")      // x4   = br2 bi2 br3 bi3
            __ASM_EMIT("movaps      %%xmm2, %%xmm3")            // x3   = br0 bi0 br1 bi1
            __ASM_EMIT("shufps      $0x88, %%xmm4, %%xmm2")     // x2   = br0 br1 br2 br3
            __ASM_EMIT("shufps      $0xdd, %%xmm4, %%xmm3")     // x3   = bi0 bi1 bi2 bi3
            PHF_APPLY
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x10(%[dst])")
            // Load filter params and repeat loop
            F_LOAD
            __ASM_EMIT("add         $0x10, %[f]")
            __ASM_EMIT("add         $0x20, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")
            __ASM_EMIT("2:")
            // x2 block: handle 2 remaining frequencies (only low 2 lanes used)
            __ASM_EMIT("add         $2, %[count]")
            __ASM_EMIT("jl          4f")
            __ASM_EMIT("movlps      0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            __ASM_EMIT("movups      0x00(%[dst]), %%xmm2")      // x2   = br0 bi0 br1 bi1
            __ASM_EMIT("movaps      %%xmm2, %%xmm3")            // x3   = br0 bi0 br1 bi1
            __ASM_EMIT("shufps      $0x88, %%xmm2, %%xmm2")     // x2   = br0 br1 br0 br1
            __ASM_EMIT("shufps      $0xdd, %%xmm3, %%xmm3")     // x3   = bi0 bi1 bi0 bi1
            PHF_APPLY
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            F_LOAD
            __ASM_EMIT("sub         $2, %[count]")
            __ASM_EMIT("add         $0x08, %[f]")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("4:")
            // x1 block: handle the last remaining frequency, if any
            __ASM_EMIT("add         $1, %[count]")
            __ASM_EMIT("jl          6f")
            __ASM_EMIT("movss       0x00(%[f]), %%xmm3")        // x3   = f
            PHF_CORE
            __ASM_EMIT("movss       0x00(%[dst]), %%xmm2")      // x2   = br0
            __ASM_EMIT("movss       0x04(%[dst]), %%xmm3")      // x3   = bi0
            PHF_APPLY
            __ASM_EMIT("movlps      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("6:")

            : [dst] "+r" (dst), [f] "+r" (freq), [count] "+r" (count)
            : [c] "r" (c),
              [fp] "o" (fp)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
376 
377     #undef PHF_CORE
378     #undef PHF_APPLY
379 
380     #undef F_UNPACK
381     #undef F_LOAD
382 }
383 
384 #endif /* DSP_ARCH_X86_SSE_FILTERS_TRANSFER_H_ */
385