1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 6 мар. 2017 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 #ifndef DSP_ARCH_X86_SSE_FASTCONV_H_IMPL
22     #error "This header should not be included directly"
23 #endif /* DSP_ARCH_X86_SSE_FASTCONV_H_IMPL */
24 
25 namespace sse
26 {
fastconv_restore(float * dst,float * tmp,size_t rank)27     void fastconv_restore(float *dst, float *tmp, size_t rank)
28     {
29         // Prepare for butterflies
30         size_t last     = size_t(1) << rank;
31         size_t items    = last;
32         float *ptr      = tmp;
33 
34         ARCH_X86_ASM
35         (
36     //                __ASM_EMIT(".align 16")
37             __ASM_EMIT("1:")
38 
39             // Load data
40             __ASM_EMIT("movups      0x00(%[ptr]), %%xmm1")
41             __ASM_EMIT("movups      0x10(%[ptr]), %%xmm3")
42             __ASM_EMIT("movups      0x20(%[ptr]), %%xmm5")
43             __ASM_EMIT("movups      0x30(%[ptr]), %%xmm7")
44 
45             // Do x4 reverse butterflies
46             // xmm1 = r0 r1 r2 r3
47             // xmm3 = i0 i1 i2 i3
48             // xmm5 = r4 r5 r6 r7
49             // xmm7 = i4 i5 i6 i7
50             __ASM_EMIT("movaps      %%xmm1, %%xmm0")            /* xmm0 = r0 r1 r2 r3 */
51             __ASM_EMIT("movaps      %%xmm5, %%xmm4")
52             __ASM_EMIT("shufps      $0x88, %%xmm3, %%xmm0")     /* xmm0 = r0 r2 i0 i2 */
53             __ASM_EMIT("shufps      $0x88, %%xmm7, %%xmm4")
54             __ASM_EMIT("shufps      $0xdd, %%xmm3, %%xmm1")     /* xmm1 = r1 r3 i1 i3 */
55             __ASM_EMIT("shufps      $0xdd, %%xmm7, %%xmm5")
56             __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* xmm2 = r0 r2 i0 i2 */
57             __ASM_EMIT("movaps      %%xmm4, %%xmm6")
58             __ASM_EMIT("addps       %%xmm1, %%xmm0")            /* xmm0 = r0+r1 r2+r3 i0+i1 i2+i3 = r0k r2k i0k i2k */
59             __ASM_EMIT("addps       %%xmm5, %%xmm4")
60             __ASM_EMIT("subps       %%xmm1, %%xmm2")            /* xmm2 = r0-r1 r2-r3 i0-i1 i2-i3 = r1k r3k i1k i3k */
61             __ASM_EMIT("subps       %%xmm5, %%xmm6")
62 
63             __ASM_EMIT("movaps      %%xmm0, %%xmm1")            /* xmm1 = r0k r2k i0k i2k */
64             __ASM_EMIT("movaps      %%xmm4, %%xmm5")
65             __ASM_EMIT("shufps      $0x88, %%xmm2, %%xmm0")     /* xmm0 = r0k i0k r1k i1k */
66             __ASM_EMIT("shufps      $0x88, %%xmm6, %%xmm4")
67             __ASM_EMIT("shufps      $0x7d, %%xmm2, %%xmm1")     /* xmm1 = r2k i2k i3k r3k */
68             __ASM_EMIT("shufps      $0x7d, %%xmm6, %%xmm5")
69             __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* xmm2 = r0k i0k r1k i1k */
70             __ASM_EMIT("movaps      %%xmm4, %%xmm6")
71             __ASM_EMIT("addps       %%xmm1, %%xmm0")            /* xmm0 = r0k+r2k i0k+i2k r1k+i3k i1k+r3k = d0 d4 d3 d5 */
72             __ASM_EMIT("addps       %%xmm5, %%xmm4")
73             __ASM_EMIT("subps       %%xmm1, %%xmm2")            /* xmm2 = r0k-r2k i0k-i2k r1k-i3k i1k-r3k = d2 d6 d1 d7  */
74             __ASM_EMIT("subps       %%xmm5, %%xmm6")
75 
76             __ASM_EMIT("movaps      %%xmm0, %%xmm1")            /* xmm1 = d0 d4 d3 d5 */
77             __ASM_EMIT("movaps      %%xmm4, %%xmm5")
78             __ASM_EMIT("shufps      $0x88, %%xmm2, %%xmm0")     /* xmm0 = d0 d3 d2 d1 */
79             __ASM_EMIT("shufps      $0x88, %%xmm6, %%xmm4")
80             __ASM_EMIT("shufps      $0xdd, %%xmm2, %%xmm1")     /* xmm1 = d4 d5 d6 d7 */
81             __ASM_EMIT("shufps      $0xdd, %%xmm6, %%xmm5")
82             __ASM_EMIT("shufps      $0x6c, %%xmm0, %%xmm0")     /* xmm0 = d0 d1 d2 d3 */
83             __ASM_EMIT("shufps      $0x6c, %%xmm4, %%xmm4")
84 
85             // Finally store result
86             __ASM_EMIT("movups      %%xmm0, 0x00(%[ptr])")
87             __ASM_EMIT("movups      %%xmm1, 0x10(%[ptr])")
88             __ASM_EMIT("movups      %%xmm4, 0x20(%[ptr])")
89             __ASM_EMIT("movups      %%xmm5, 0x30(%[ptr])")
90 
91             __ASM_EMIT("add         $0x40, %[ptr]")
92             __ASM_EMIT("sub         $8, %[k]")
93             __ASM_EMIT("jnz         1b")
94 
95              : [ptr] "+r" (ptr), [k] "+r" (items)
96              :
97              : "cc", "memory",
98                "%xmm0", "%xmm1", "%xmm2", "%xmm3",
99                "%xmm4", "%xmm5", "%xmm6", "%xmm7"
100         );
101 
102         items           = last << 1;
103         size_t n        = 8;
104         size_t bs       = n << 1;
105 
106         const float *wk     = XFFT_W;
107         const float *ak     = XFFT_A;
108 
109         // Iterate butterflies
110         while (n < last)
111         {
112             for (size_t p=0; p<items; p += bs)
113             {
114                 // Set initial values of pointers
115                 float *a        = &tmp[p];
116                 float *b        = &a[n];
117                 size_t k        = n;
118 
119                 ARCH_X86_ASM
120                 (
121                     __ASM_EMIT("movaps      0x00(%[ak]), %%xmm6")       /* xmm6 = rak[i] */
122                     __ASM_EMIT("movaps      0x10(%[ak]), %%xmm7")       /* xmm7 = iak[i] */
123                     :
124                     : [ak] "r"(ak)
125                     : "%xmm6", "%xmm7"
126                 );
127 
128                 ARCH_X86_ASM
129                 (
130     //                        __ASM_EMIT(".align 16")
131                     __ASM_EMIT("1:")
132 
133                     __ASM_EMIT("movups      0x00(%[a]), %%xmm0")        /* xmm0 = ra[i] */
134                     __ASM_EMIT("movups      0x10(%[a]), %%xmm1")        /* xmm1 = ia[i] */
135                     __ASM_EMIT("movups      0x00(%[b]), %%xmm2")        /* xmm2 = rb[i] */
136                     __ASM_EMIT("movups      0x10(%[b]), %%xmm3")        /* xmm3 = ib[i] */
137 
138                     __ASM_EMIT("movaps      %%xmm2, %%xmm4")            /* xmm4 = rb[i] */
139                     __ASM_EMIT("movaps      %%xmm3, %%xmm5")            /* xmm5 = ib[i] */
140                     __ASM_EMIT("mulps       %%xmm6, %%xmm2")            /* xmm2 = rak[i]*rb[i] */
141                     __ASM_EMIT("mulps       %%xmm7, %%xmm4")            /* xmm4 = iak[i]*rb[i] */
142                     __ASM_EMIT("mulps       %%xmm6, %%xmm3")            /* xmm3 = rak[i]*ib[i] */
143                     __ASM_EMIT("mulps       %%xmm7, %%xmm5")            /* xmm5 = iak[i]*ib[i] */
144                     __ASM_EMIT("addps       %%xmm4, %%xmm3")            /* xmm3 = rak[i]*ib[i] + iak[i]*rb[i] = ic[i] */
145                     __ASM_EMIT("subps       %%xmm5, %%xmm2")            /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */
146                     __ASM_EMIT("movaps      %%xmm0, %%xmm4")            /* xmm4 = ra[i] */
147                     __ASM_EMIT("movaps      %%xmm1, %%xmm5")            /* xmm5 = ia[i] */
148                     __ASM_EMIT("subps       %%xmm2, %%xmm0")            /* xmm0 = ra[i] - rc[i] */
149                     __ASM_EMIT("subps       %%xmm3, %%xmm1")            /* xmm1 = ia[i] - ic[i] */
150                     __ASM_EMIT("addps       %%xmm4, %%xmm2")            /* xmm2 = ra[i] + rc[i] */
151                     __ASM_EMIT("addps       %%xmm5, %%xmm3")            /* xmm3 = ia[i] + ic[i] */
152 
153                     __ASM_EMIT("movups      %%xmm2, 0x00(%[a])")
154                     __ASM_EMIT("movups      %%xmm3, 0x10(%[a])")
155                     __ASM_EMIT("movups      %%xmm0, 0x00(%[b])")
156                     __ASM_EMIT("movups      %%xmm1, 0x10(%[b])")
157 
158                     __ASM_EMIT("add         $0x20, %[a]")
159                     __ASM_EMIT("add         $0x20, %[b]")
160                     __ASM_EMIT("sub         $8, %[k]")
161                     __ASM_EMIT("jz          2f")
162 
163                     /* Rotate angle */
164                     __ASM_EMIT("movaps      0x00(%[wk]), %%xmm0")       /* xmm0 = rw */
165                     __ASM_EMIT("movaps      0x10(%[wk]), %%xmm1")       /* xmm1 = iw */
166                     __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* xmm2 = rw */
167                     __ASM_EMIT("movaps      %%xmm1, %%xmm3")            /* xmm3 = iw */
168                     __ASM_EMIT("mulps       %%xmm6, %%xmm3")            /* xmm3 = ra * iw */
169                     __ASM_EMIT("mulps       %%xmm7, %%xmm1")            /* xmm1 = ia * iw */
170                     __ASM_EMIT("mulps       %%xmm0, %%xmm6")            /* xmm6 = ra * rw */
171                     __ASM_EMIT("mulps       %%xmm2, %%xmm7")            /* xmm7 = ia * rw */
172                     __ASM_EMIT("subps       %%xmm1, %%xmm6")            /* xmm6 = ra * rw - ia * iw */
173                     __ASM_EMIT("addps       %%xmm3, %%xmm7")            /* xmm7 = ra * iw + ia * rw */
174                     __ASM_EMIT("jmp         1b")
175 
176                     __ASM_EMIT("2:")
177 
178                     : [a] "+r"(a), [b] "+r"(b), [k] "+r" (k)
179                     : [wk] "r"(wk)
180                     : "cc", "memory",
181                       "%xmm0", "%xmm1", "%xmm2", "%xmm3",
182                       "%xmm4", "%xmm5", "%xmm6", "%xmm7"
183                 );
184             }
185 
186             wk     += 8;
187             ak     += 8;
188             n     <<= 1;
189             bs    <<= 1;
190         }
191 
192         if (n < items)
193         {
194             // ONE LARGE CYCLE
195             // Set initial values of pointers
196             float kn            = 1.0f / last;
197             size_t k            = n;
198 
199             ARCH_X86_ASM
200             (
201                 __ASM_EMIT("movaps      0x00(%[ak]), %%xmm6")           /* xmm6 = rak[i] */
202                 __ASM_EMIT("movaps      0x10(%[ak]), %%xmm7")           /* xmm7 = iak[i] */
203                 __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0")         /* xmm0 = kn */
204                 __ASM_EMIT("movaps      %%xmm0, %%xmm1")                /* xmm1 = kn */
205                 : "+Yz" (kn)
206                 : [ak] "r" (ak)
207                 : "%xmm1", "%xmm6", "%xmm7"
208             );
209 
210             ARCH_X86_ASM
211             (
212     //                    __ASM_EMIT(".align 16")
213                 __ASM_EMIT("1:")
214 
215                 __ASM_EMIT("movups      0x00(%[tmp]), %%xmm4")          /* xmm4 = ra[i] */
216                 __ASM_EMIT("movups      0x00(%[tmp],%[n],4), %%xmm2")   /* xmm2 = rb[i] */
217                 __ASM_EMIT("movups      0x10(%[tmp],%[n],4), %%xmm3")   /* xmm3 = ib[i] */
218 
219                 __ASM_EMIT("mulps       %%xmm6, %%xmm2")                /* xmm2 = rak[i]*rb[i] */
220                 __ASM_EMIT("mulps       %%xmm7, %%xmm3")                /* xmm3 = iak[i]*ib[i] */
221                 __ASM_EMIT("movaps      %%xmm4, %%xmm5")                /* xmm5 = ra[i] */
222                 __ASM_EMIT("subps       %%xmm3, %%xmm2")                /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */
223                 __ASM_EMIT("addps       %%xmm2, %%xmm4")                /* xmm4 = ra[i] + rc[i] */
224                 __ASM_EMIT("subps       %%xmm2, %%xmm5")                /* xmm5 = ra[i] - rc[i] */
225                 __ASM_EMIT("mulps       %%xmm0, %%xmm4")                /* xmm4 = kn*(ra[i] + rc[i]) */
226                 __ASM_EMIT("mulps       %%xmm1, %%xmm5")                /* xmm5 = kn*(ra[i] - rc[i]) */
227 
228                 __ASM_EMIT("movups      %%xmm4, 0x00(%[dst])")
229                 __ASM_EMIT("movups      %%xmm5, 0x00(%[dst],%[n],2)")
230 
231                 __ASM_EMIT("add         $0x20, %[tmp]")
232                 __ASM_EMIT("add         $0x10, %[dst]")
233                 __ASM_EMIT("sub         $8, %[k]")
234                 __ASM_EMIT("jz          2f")
235 
236                 /* Rotate angle */
237                 __ASM_EMIT("movaps      0x00(%[wk]), %%xmm4")           /* xmm4 = rw */
238                 __ASM_EMIT("movaps      0x10(%[wk]), %%xmm5")           /* xmm5 = iw */
239                 __ASM_EMIT("movaps      %%xmm4, %%xmm2")                /* xmm2 = rw */
240                 __ASM_EMIT("movaps      %%xmm5, %%xmm3")                /* xmm3 = iw */
241                 __ASM_EMIT("mulps       %%xmm6, %%xmm3")                /* xmm3 = ra * iw */
242                 __ASM_EMIT("mulps       %%xmm7, %%xmm5")                /* xmm5 = ia * iw */
243                 __ASM_EMIT("mulps       %%xmm4, %%xmm6")                /* xmm6 = ra * rw */
244                 __ASM_EMIT("mulps       %%xmm2, %%xmm7")                /* xmm7 = ia * rw */
245                 __ASM_EMIT("subps       %%xmm5, %%xmm6")                /* xmm6 = ra * rw - ia * iw */
246                 __ASM_EMIT("addps       %%xmm3, %%xmm7")                /* xmm7 = ra * iw + ia * rw */
247                 __ASM_EMIT("jmp         1b")
248 
249                 __ASM_EMIT("2:")
250 
251                 : [tmp] "+r"(tmp), [dst] "+r"(dst), [n] "+r"(n), [k] "+r" (k)
252                 : [wk] "r"(wk)
253                 : "cc", "memory",
254                   "%xmm0", "%xmm5", "%xmm2", "%xmm3",
255                   "%xmm4", "%xmm5", "%xmm6", "%xmm7"
256             );
257         }
258         else
259         {
260             // Add real result to the target (ignore complex result)
261             float kn    = 1.0f / last;
262 
263             // Unpack 4x split complex to 4x real
264             ARCH_X86_ASM
265             (
266                 __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0")         /* xmm0 = kn */
267                 __ASM_EMIT("movups      0x00(%[tmp]), %%xmm1")          /* xmm0 = s[i] */
268                 __ASM_EMIT("movups      0x00(%[dst]), %%xmm2")
269                 __ASM_EMIT("mulps       %%xmm0, %%xmm1")
270                 __ASM_EMIT("addps       %%xmm1, %%xmm2")
271                 __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
272                 : "+Yz" (kn)
273                 : [tmp] "r" (tmp), [dst] "r" (dst)
274                 : "cc", "memory",
275                   "%xmm1", "%xmm2"
276             );
277         }
278 
279     } // RESTORE_IMPL
280 }
281