/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 6 Mar 2017
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */
21 #ifndef DSP_ARCH_X86_SSE_FASTCONV_H_IMPL
22     #error "This header should not be included directly"
23 #endif /* DSP_ARCH_X86_SSE_FASTCONV_H_IMPL */
24 
25 namespace sse
26 {
fastconv_restore_internal(float * dst,float * tmp,size_t rank)27     static inline void fastconv_restore_internal(float *dst, float *tmp, size_t rank)
28     {
29         // Prepare for butterflies
30         size_t last     = size_t(1) << rank;
31         size_t items    = last << 1;
32         size_t n        = 8;
33         size_t bs       = n << 1;
34 
35         const float *wk     = XFFT_W;
36         const float *ak     = XFFT_A;
37 
38         // Iterate butterflies
39         while (n < last)
40         {
41             for (size_t p=0; p<items; p += bs)
42             {
43                 // Set initial values of pointers
44                 float *a        = &tmp[p];
45                 float *b        = &a[n];
46                 size_t k        = n;
47 
48                 ARCH_X86_ASM
49                 (
50                     __ASM_EMIT("movaps      0x00(%[ak]), %%xmm6")       /* xmm6 = rak[i] */
51                     __ASM_EMIT("movaps      0x10(%[ak]), %%xmm7")       /* xmm7 = iak[i] */
52                     :
53                     : [ak] "r"(ak)
54                     : "%xmm6", "%xmm7"
55                 );
56 
57                 ARCH_X86_ASM
58                 (
59     //                        __ASM_EMIT(".align 16")
60                     __ASM_EMIT("1:")
61 
62                     __ASM_EMIT("movups      0x00(%[a]), %%xmm0")        /* xmm0 = ra[i] */
63                     __ASM_EMIT("movups      0x10(%[a]), %%xmm1")        /* xmm1 = ia[i] */
64                     __ASM_EMIT("movups      0x00(%[b]), %%xmm2")        /* xmm2 = rb[i] */
65                     __ASM_EMIT("movups      0x10(%[b]), %%xmm3")        /* xmm3 = ib[i] */
66 
67                     __ASM_EMIT("movaps      %%xmm2, %%xmm4")            /* xmm4 = rb[i] */
68                     __ASM_EMIT("movaps      %%xmm3, %%xmm5")            /* xmm5 = ib[i] */
69                     __ASM_EMIT("mulps       %%xmm6, %%xmm2")            /* xmm2 = rak[i]*rb[i] */
70                     __ASM_EMIT("mulps       %%xmm7, %%xmm4")            /* xmm4 = iak[i]*rb[i] */
71                     __ASM_EMIT("mulps       %%xmm6, %%xmm3")            /* xmm3 = rak[i]*ib[i] */
72                     __ASM_EMIT("mulps       %%xmm7, %%xmm5")            /* xmm5 = iak[i]*ib[i] */
73                     __ASM_EMIT("addps       %%xmm4, %%xmm3")            /* xmm3 = rak[i]*ib[i] + iak[i]*rb[i] = ic[i] */
74                     __ASM_EMIT("subps       %%xmm5, %%xmm2")            /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */
75                     __ASM_EMIT("movaps      %%xmm0, %%xmm4")            /* xmm4 = ra[i] */
76                     __ASM_EMIT("movaps      %%xmm1, %%xmm5")            /* xmm5 = ia[i] */
77                     __ASM_EMIT("subps       %%xmm2, %%xmm0")            /* xmm0 = ra[i] - rc[i] */
78                     __ASM_EMIT("subps       %%xmm3, %%xmm1")            /* xmm1 = ia[i] - ic[i] */
79                     __ASM_EMIT("addps       %%xmm4, %%xmm2")            /* xmm2 = ra[i] + rc[i] */
80                     __ASM_EMIT("addps       %%xmm5, %%xmm3")            /* xmm3 = ia[i] + ic[i] */
81 
82                     __ASM_EMIT("movups      %%xmm2, 0x00(%[a])")
83                     __ASM_EMIT("movups      %%xmm3, 0x10(%[a])")
84                     __ASM_EMIT("movups      %%xmm0, 0x00(%[b])")
85                     __ASM_EMIT("movups      %%xmm1, 0x10(%[b])")
86 
87                     __ASM_EMIT("add         $0x20, %[a]")
88                     __ASM_EMIT("add         $0x20, %[b]")
89                     __ASM_EMIT("sub         $8, %[k]")
90                     __ASM_EMIT("jz          2f")
91 
92                     /* Rotate angle */
93                     __ASM_EMIT("movaps      0x00(%[wk]), %%xmm0")       /* xmm0 = rw */
94                     __ASM_EMIT("movaps      0x10(%[wk]), %%xmm1")       /* xmm1 = iw */
95                     __ASM_EMIT("movaps      %%xmm0, %%xmm2")            /* xmm2 = rw */
96                     __ASM_EMIT("movaps      %%xmm1, %%xmm3")            /* xmm3 = iw */
97                     __ASM_EMIT("mulps       %%xmm6, %%xmm3")            /* xmm3 = ra * iw */
98                     __ASM_EMIT("mulps       %%xmm7, %%xmm1")            /* xmm1 = ia * iw */
99                     __ASM_EMIT("mulps       %%xmm0, %%xmm6")            /* xmm6 = ra * rw */
100                     __ASM_EMIT("mulps       %%xmm2, %%xmm7")            /* xmm7 = ia * rw */
101                     __ASM_EMIT("subps       %%xmm1, %%xmm6")            /* xmm6 = ra * rw - ia * iw */
102                     __ASM_EMIT("addps       %%xmm3, %%xmm7")            /* xmm7 = ra * iw + ia * rw */
103                     __ASM_EMIT("jmp         1b")
104 
105                     __ASM_EMIT("2:")
106 
107                     : [a] "+r"(a), [b] "+r"(b), [k] "+r" (k)
108                     : [wk] "r"(wk)
109                     : "cc", "memory",
110                       "%xmm0", "%xmm1", "%xmm2", "%xmm3",
111                       "%xmm4", "%xmm5", "%xmm6", "%xmm7"
112                 );
113             }
114 
115             wk     += 8;
116             ak     += 8;
117             n     <<= 1;
118             bs    <<= 1;
119         }
120 
121         if (n < items)
122         {
123             // ONE LARGE CYCLE
124             // Set initial values of pointers
125             float kn            = 1.0f / last;
126             size_t k            = n;
127 
128             ARCH_X86_ASM
129             (
130                 __ASM_EMIT("movaps      0x00(%[ak]), %%xmm6")           /* xmm6 = rak[i] */
131                 __ASM_EMIT("movaps      0x10(%[ak]), %%xmm7")           /* xmm7 = iak[i] */
132                 __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0")         /* xmm0 = kn */
133                 __ASM_EMIT("movaps      %%xmm0, %%xmm1")                /* xmm1 = kn */
134                 : "+Yz" (kn)
135                 : [ak] "r" (ak)
136                 : "%xmm1", "%xmm6", "%xmm7"
137             );
138 
139             ARCH_X86_ASM
140             (
141     //                    __ASM_EMIT(".align 16")
142                 __ASM_EMIT("1:")
143 
144                 __ASM_EMIT("movups      0x00(%[tmp]), %%xmm4")          /* xmm4 = ra[i] */
145                 __ASM_EMIT("movups      0x00(%[tmp],%[n],4), %%xmm2")   /* xmm2 = rb[i] */
146                 __ASM_EMIT("movups      0x10(%[tmp],%[n],4), %%xmm3")   /* xmm3 = ib[i] */
147 
148                 __ASM_EMIT("mulps       %%xmm6, %%xmm2")                /* xmm2 = rak[i]*rb[i] */
149                 __ASM_EMIT("mulps       %%xmm7, %%xmm3")                /* xmm3 = iak[i]*ib[i] */
150                 __ASM_EMIT("movaps      %%xmm4, %%xmm5")                /* xmm5 = ra[i] */
151                 __ASM_EMIT("subps       %%xmm3, %%xmm2")                /* xmm2 = rak[i]*rb[i] - iak[i]*ib[i] = rc[i] */
152                 __ASM_EMIT("addps       %%xmm2, %%xmm4")                /* xmm4 = ra[i] + rc[i] */
153                 __ASM_EMIT("subps       %%xmm2, %%xmm5")                /* xmm5 = ra[i] - rc[i] */
154                 __ASM_EMIT("mulps       %%xmm0, %%xmm4")                /* xmm4 = kn*(ra[i] + rc[i]) */
155                 __ASM_EMIT("mulps       %%xmm1, %%xmm5")                /* xmm5 = kn*(ra[i] - rc[i]) */
156 
157                 __ASM_EMIT("movups      0x00(%[dst]), %%xmm2")
158                 __ASM_EMIT("movups      0x00(%[dst], %[n], 2), %%xmm3")
159                 __ASM_EMIT("addps       %%xmm4, %%xmm2")
160                 __ASM_EMIT("addps       %%xmm5, %%xmm3")
161                 __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
162                 __ASM_EMIT("movups      %%xmm3, 0x00(%[dst],%[n],2)")
163 
164                 __ASM_EMIT("add         $0x20, %[tmp]")
165                 __ASM_EMIT("add         $0x10, %[dst]")
166                 __ASM_EMIT("sub         $8, %[k]")
167                 __ASM_EMIT("jz          2f")
168 
169                 /* Rotate angle */
170                 __ASM_EMIT("movaps      0x00(%[wk]), %%xmm4")           /* xmm4 = rw */
171                 __ASM_EMIT("movaps      0x10(%[wk]), %%xmm5")           /* xmm5 = iw */
172                 __ASM_EMIT("movaps      %%xmm4, %%xmm2")                /* xmm2 = rw */
173                 __ASM_EMIT("movaps      %%xmm5, %%xmm3")                /* xmm3 = iw */
174                 __ASM_EMIT("mulps       %%xmm6, %%xmm3")                /* xmm3 = ra * iw */
175                 __ASM_EMIT("mulps       %%xmm7, %%xmm5")                /* xmm5 = ia * iw */
176                 __ASM_EMIT("mulps       %%xmm4, %%xmm6")                /* xmm6 = ra * rw */
177                 __ASM_EMIT("mulps       %%xmm2, %%xmm7")                /* xmm7 = ia * rw */
178                 __ASM_EMIT("subps       %%xmm5, %%xmm6")                /* xmm6 = ra * rw - ia * iw */
179                 __ASM_EMIT("addps       %%xmm3, %%xmm7")                /* xmm7 = ra * iw + ia * rw */
180                 __ASM_EMIT("jmp         1b")
181 
182                 __ASM_EMIT("2:")
183 
184                 : [tmp] "+r"(tmp), [dst] "+r"(dst), [n] "+r"(n), [k] "+r" (k)
185                 : [wk] "r"(wk)
186                 : "cc", "memory",
187                   "%xmm0", "%xmm5", "%xmm2", "%xmm3",
188                   "%xmm4", "%xmm5", "%xmm6", "%xmm7"
189             );
190         }
191         else
192         {
193             // Add real result to the target (ignore complex result)
194             float kn    = 1.0f / last;
195 
196             // Unpack 4x split complex to 4x real
197             ARCH_X86_ASM
198             (
199                 __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0")         /* xmm0 = kn */
200                 __ASM_EMIT("movups      0x00(%[tmp]), %%xmm1")          /* xmm0 = s[i] */
201                 __ASM_EMIT("movups      0x00(%[dst]), %%xmm2")
202                 __ASM_EMIT("mulps       %%xmm0, %%xmm1")
203                 __ASM_EMIT("addps       %%xmm1, %%xmm2")
204                 __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
205                 : "+Yz" (kn)
206                 : [tmp] "r" (tmp), [dst] "r" (dst)
207                 : "cc", "memory",
208                   "%xmm1", "%xmm2"
209             );
210         }
211     }
212 }
213