/*
 * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
 * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include "complexfilter.h"
#include <math.h>
#include "fftwindow.h"

namespace RawStudio {
namespace FFTFilter {

#if defined (__i386__) || defined (__x86_64__)

#if defined (__x86_64__)
void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
  }
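  // Each constant above is stored four times so it fills one 16-byte SSE
  // lane; the byte offsets noted in the comments (0, 16, 32, ...) are where
  // the asm below fetches each broadcast value with a single aligned movaps.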
  int size = bw*bh;
  if ((size & 7) == 0) {   // TODO: Benchmark on a non-VM platform - slower under VMware
    asm volatile
    (
    "movaps 16(%1),%%xmm14\n"         // Load gridfraction into xmm14
    "loop_sharpenonly_sse3_big:\n"
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps 32(%2), %%xmm8\n"         // in r4i4 r5i5
    "movaps 48(%2), %%xmm9\n"         // in r6i6 r7i7
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3
    "movaps 32(%3), %%xmm12\n"        // grid r4i4 r5i5
    "movaps 48(%3), %%xmm13\n"        // grid r6i6 r7i7

    "mulps %%xmm14, %%xmm4\n"         // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm14, %%xmm5\n"         // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "mulps %%xmm14, %%xmm12\n"        // grid r4*gf i4*gf r5*gf i5*gf
    "mulps %%xmm14, %%xmm13\n"        // grid r6*gf i6*gf r7*gf i7*gf
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "movaps %%xmm12, %%xmm10\n"       // keep gridcorrection in xmm10/xmm11
    "movaps %%xmm13, %%xmm11\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "subps %%xmm12, %%xmm8\n"         // re4 im4 re5 im5
    "subps %%xmm13, %%xmm9\n"         // re6 im6 re7 im7
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"
    "movaps %%xmm8, %%xmm12\n"        // copy re/im into xmm12 and xmm13; xmm8 & xmm9 retained
    "movaps %%xmm9, %%xmm13\n"

    "movaps (%1), %%xmm6\n"           // Move 1e-15 into xmm6
    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "mulps %%xmm12, %%xmm12\n"        // r4i4 r5i5 squared
    "mulps %%xmm13, %%xmm13\n"        // r6i6 r7i7 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "haddps %%xmm13, %%xmm12\n"       // r4+i4 r5+i5 r6+i6 r7+i7 (all squared) (SSE3!) - xmm13 free
    "addps %%xmm6, %%xmm4\n"          // add 1e-15 (xmm4: psd for pixels 0-3)
    "addps %%xmm6, %%xmm12\n"         // add 1e-15 (xmm12: psd for pixels 4-7)
    "movaps 48(%1), %%xmm7\n"         // Move sigmaSquaredSharpenMax into xmm7

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps 32(%1), %%xmm6\n"         // Move sigmaSquaredSharpenMin into xmm6
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "movaps %%xmm12, %%xmm13\n"       // Copy psd into xmm13
    "movaps %%xmm7, %%xmm15\n"        // Copy sigmaSquaredSharpenMax
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "addps %%xmm7, %%xmm12\n"         // xmm12 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "mulps %%xmm13, %%xmm15\n"        // xmm15 = psd*sigmaSquaredSharpenMax
    "addps %%xmm6, %%xmm5\n"          // xmm5 = psd + sigmaSquaredSharpenMin
    "addps %%xmm6, %%xmm13\n"         // xmm13 = psd + sigmaSquaredSharpenMin - xmm6 free

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "mulps %%xmm12, %%xmm13\n"        // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm12 free
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "movaps (%4), %%xmm6\n"           // load wsharpen[0..3]
    "rcpps %%xmm13, %%xmm13\n"        // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "movaps 16(%4), %%xmm4\n"         // load wsharpen[4..7]
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "mulps %%xmm13, %%xmm15\n"        // same for pixels 4-7 - xmm13 free
    "movaps 64(%1), %%xmm5\n"         // Load "1.0"
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rsqrtps %%xmm15, %%xmm15\n"      // same for pixels 4-7
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "rcpps %%xmm15, %%xmm15\n"        // sqrt(..)
    "mulps %%xmm6, %%xmm7\n"          // multiply by wsharpen
    "mulps %%xmm4, %%xmm15\n"         // multiply by wsharpen
    "addps %%xmm5, %%xmm7\n"          // + 1.0; xmm7 = sfact for pixels 0-3
    "addps %%xmm5, %%xmm15\n"         // + 1.0; xmm15 = sfact for pixels 4-7
    "movaps %%xmm7, %%xmm5\n"
    "movaps %%xmm15, %%xmm13\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5
    "unpcklps %%xmm15, %%xmm15\n"     // unpack low to xmm15
    "unpckhps %%xmm13, %%xmm13\n"     // unpack high to xmm13

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "mulps %%xmm15, %%xmm8\n"         // re+im *= sfact
    "mulps %%xmm13, %%xmm9\n"         // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "addps %%xmm10, %%xmm8\n"         // add gridcorrection
    "addps %%xmm11, %%xmm9\n"         // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "movaps %%xmm8, 32(%2)\n"         // Store
    "movaps %%xmm9, 48(%2)\n"         // Store
    "sub $8, %0\n"                    // size -= 8
    "add $64, %2\n"                   // outcur += 64 bytes
    "add $64, %3\n"                   // gridsample += 64 bytes
    "add $32, %4\n"                   // wsharpen += 32 bytes
    "cmp $0, %0\n"
    "jg loop_sharpenonly_sse3_big\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
    );
  } else {
    asm volatile
    (
    "movaps (%1), %%xmm11\n"          // 1e-15
    "movaps 16(%1), %%xmm12\n"        // gridfraction
    "movaps 32(%1), %%xmm13\n"        // sigmaSquaredSharpenMin
    "movaps 48(%1), %%xmm14\n"        // sigmaSquaredSharpenMax
    "movaps 64(%1), %%xmm15\n"        // 1.0
    "loop_sharpenonly_sse3:\n"
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm12, %%xmm4\n"         // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm12, %%xmm5\n"         // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "addps %%xmm11, %%xmm4\n"         // add 1e-15 (xmm4: psd for all 4 pixels)
    "movaps %%xmm14, %%xmm7\n"        // Move sigmaSquaredSharpenMax into xmm7

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps %%xmm13, %%xmm5\n"         // xmm5 = psd + sigmaSquaredSharpenMin

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "movaps (%4), %%xmm6\n"           // load wsharpen[0..3]
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "mulps %%xmm6, %%xmm7\n"          // multiply by wsharpen
    "addps %%xmm15, %%xmm7\n"         // + 1.0; xmm7 = sfact
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_sharpenonly_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
    );
  }
}

#else  // 32 bits
void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_sharpenonly_sse3:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "movaps 32(%1), %%xmm6\n"         // Move sigmaSquaredSharpenMin into xmm6
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)
    "movaps 48(%1), %%xmm7\n"         // Move sigmaSquaredSharpenMax into xmm7

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps %%xmm6, %%xmm5\n"          // xmm5 = psd + sigmaSquaredSharpenMin - xmm6 free

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "movaps (%4), %%xmm6\n"           // load wsharpen[0..3]
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "movaps 64(%1), %%xmm5\n"         // Load "1.0"
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "mulps %%xmm6, %%xmm7\n"          // multiply by wsharpen
    "addps %%xmm5, %%xmm7\n"          // + 1.0; xmm7 = sfact
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_sharpenonly_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
  );
}
#endif
void DeGridComplexFilter::processSharpenOnlySSE(ComplexBlock* block) {
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_sharpenonly_sse:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared

    "movaps %%xmm4, %%xmm7\n"
    "shufps $136, %%xmm5, %%xmm4\n"   // xmm4 = r0 r1 r2 r3 (squared)  [10 00 10 00 = 136]
    "shufps $221, %%xmm5, %%xmm7\n"   // xmm7 = i0 i1 i2 i3 (squared)  [11 01 11 01 = 221]
    "movaps 32(%1), %%xmm6\n"         // Move sigmaSquaredSharpenMin into xmm6
    "addps %%xmm7, %%xmm4\n"          // xmm4 = re*re + im*im per pixel (no SSE3 haddps needed)
    "movaps 48(%1), %%xmm7\n"         // Move sigmaSquaredSharpenMax into xmm7
    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps %%xmm6, %%xmm5\n"          // xmm5 = psd + sigmaSquaredSharpenMin - xmm6 free

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "movaps (%4), %%xmm6\n"           // load wsharpen[0..3]
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "movaps 64(%1), %%xmm5\n"         // Load "1.0"
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "mulps %%xmm6, %%xmm7\n"          // multiply by wsharpen
    "addps %%xmm5, %%xmm7\n"          // + 1.0; xmm7 = sfact
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_sharpenonly_sse\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
  );
}
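
/* For reference, a minimal scalar sketch of what the processSharpenOnly
 * routines above compute per complex sample. This helper is illustrative
 * only: its name is invented for this sketch, nothing calls it, and it
 * assumes fftwf_complex is FFTW's float[2] layout (as used above). The asm
 * approximates the divide with rcpps and the sqrtf with rcpps(rsqrtps(x)),
 * trading a little precision for speed.
 */
static inline void referenceSharpenOnlyScalar(fftwf_complex* outcur,
                                              const fftwf_complex* gridsample,
                                              const float* wsharpen,
                                              float gridfraction,
                                              float sigmaSquaredSharpenMin,
                                              float sigmaSquaredSharpenMax,
                                              int size)
{
  for (int x = 0; x < size; x++) {
    // Remove the grid bias, sharpen the residual, then add the bias back.
    float gridcorrection0 = gridfraction * gridsample[x][0];
    float gridcorrection1 = gridfraction * gridsample[x][1];
    float re = outcur[x][0] - gridcorrection0;
    float im = outcur[x][1] - gridcorrection1;
    float psd = re*re + im*im + 1e-15f;  // power spectrum density
    float sfact = 1.0f + wsharpen[x] * sqrtf(psd*sigmaSquaredSharpenMax /
        ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)));
    outcur[x][0] = re*sfact + gridcorrection0;
    outcur[x][1] = im*sfact + gridcorrection1;
  }
}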
#if defined (__x86_64__)
void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
    temp[i+20] = sigmaSquaredNoiseNormed; // 80
    temp[i+24] = lowlimit;                // 96
  }
  int size = bw*bh;
  asm volatile
  (
    "movaps (%1), %%xmm15\n"          // 1e-15f
    "movaps 16(%1),%%xmm14\n"         // Load gridfraction into xmm14
    "movaps 32(%1), %%xmm10\n"        // xmm10 = sigmaSquaredSharpenMin
    "movaps 48(%1), %%xmm11\n"        // Move sigmaSquaredSharpenMax into xmm11
    "movaps 64(%1), %%xmm9\n"         // Load "1.0"
    "movaps 80(%1), %%xmm13\n"        // sigmaSquaredNoiseNormed in xmm13
    "movaps 96(%1), %%xmm12\n"        // xmm12 = lowlimit
    "loop_wienerdegridsharpen_sse3:\n"
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm14, %%xmm4\n"         // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm14, %%xmm5\n"         // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "addps %%xmm15, %%xmm4\n"         // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7 = 1 / psd
    "subps %%xmm13, %%xmm6\n"         // xmm6 = psd - sigmaSquaredNoiseNormed
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm12, %%xmm6\n"         // xmm6 = WienerFactor = MAX(xmm6, lowlimit)

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps %%xmm11, %%xmm7\n"        // Move sigmaSquaredSharpenMax into xmm7
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm11, %%xmm4\n"         // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps %%xmm10, %%xmm5\n"         // xmm5 = psd + sigmaSquaredSharpenMin

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "rsqrtps %%xmm7, %%xmm7\n"        // 1 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(...) via reciprocal of rsqrt
    "mulps (%4), %%xmm7\n"            // multiply by wsharpen
    "addps %%xmm9, %%xmm7\n"          // + 1.0; xmm7 = sfact
    "mulps %%xmm6, %%xmm7\n"          // *= WienerFactor
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridsharpen_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
  );
}

#else  // 32 bits

void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
    temp[i+20] = sigmaSquaredNoiseNormed; // 80
    temp[i+24] = lowlimit;                // 96
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_wienerdegridsharpen_sse3:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps 80(%1), %%xmm5\n"         // sigmaSquaredNoiseNormed in xmm5
    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7 = 1 / psd
    "subps %%xmm5, %%xmm6\n"          // xmm6 = psd - sigmaSquaredNoiseNormed; xmm5 free
    "movaps 96(%1), %%xmm5\n"         // xmm5 = lowlimit
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm5, %%xmm6\n"          // xmm6 = WienerFactor = MAX(xmm6, lowlimit)

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps 48(%1), %%xmm7\n"         // Move sigmaSquaredSharpenMax into xmm7
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps 32(%1), %%xmm5\n"          // xmm5 = psd + sigmaSquaredSharpenMin

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "movaps 64(%1), %%xmm5\n"         // Load "1.0"
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "mulps (%4), %%xmm7\n"            // multiply by wsharpen
    "addps %%xmm5, %%xmm7\n"          // + 1.0; xmm7 = sfact
    "mulps %%xmm6, %%xmm7\n"          // *= WienerFactor
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridsharpen_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
  );
}
#endif

void ComplexWienerFilterDeGrid::processSharpen_SSE( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.
  float *wsharpen = sharpenWindow->getLine(0);

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredSharpenMin;   // 32
    temp[i+12] = sigmaSquaredSharpenMax;  // 48
    temp[i+16] = 1.0f;                    // 64
    temp[i+20] = sigmaSquaredNoiseNormed; // 80
    temp[i+24] = lowlimit;                // 96
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_wienerdegridsharpen_sse:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "movaps %%xmm4, %%xmm7\n"
    "shufps $136, %%xmm5, %%xmm4\n"   // xmm4 = r0 r1 r2 r3 (squared)  [10 00 10 00 = 136]
    "shufps $221, %%xmm5, %%xmm7\n"   // xmm7 = i0 i1 i2 i3 (squared)  [11 01 11 01 = 221]
    "addps %%xmm7, %%xmm4\n"          // xmm4 = re*re + im*im per pixel

    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps 80(%1), %%xmm5\n"         // sigmaSquaredNoiseNormed in xmm5
    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7 = 1 / psd
    "subps %%xmm5, %%xmm6\n"          // xmm6 = psd - sigmaSquaredNoiseNormed; xmm5 free
    "movaps 96(%1), %%xmm5\n"         // xmm5 = lowlimit
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm5, %%xmm6\n"          // xmm6 = WienerFactor = MAX(xmm6, lowlimit)

    //      float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
    "movaps 48(%1), %%xmm7\n"         // Move sigmaSquaredSharpenMax into xmm7
    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "addps %%xmm7, %%xmm4\n"          // xmm4 = psd + sigmaSquaredSharpenMax
    "mulps %%xmm5, %%xmm7\n"          // xmm7 = psd*sigmaSquaredSharpenMax
    "addps 32(%1), %%xmm5\n"          // xmm5 = psd + sigmaSquaredSharpenMin

    "mulps %%xmm4, %%xmm5\n"          // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) - xmm4 free
    "rcpps %%xmm5, %%xmm5\n"          // 1 / ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) (stall)
    "mulps %%xmm5, %%xmm7\n"          // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
    "movaps 64(%1), %%xmm5\n"         // Load "1.0"
    "rsqrtps %%xmm7, %%xmm7\n"        // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )
    "rcpps %%xmm7, %%xmm7\n"          // sqrt(..) via reciprocal of rsqrt
    "mulps (%4), %%xmm7\n"            // multiply by wsharpen
    "addps %%xmm5, %%xmm7\n"          // + 1.0; xmm7 = sfact
    "mulps %%xmm6, %%xmm7\n"          // *= WienerFactor
    "movaps %%xmm7, %%xmm5\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= sfact
    "mulps %%xmm5, %%xmm1\n"          // re+im *= sfact
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "add $16, %4\n"                   // wsharpen += 16 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridsharpen_sse\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample), "r"(wsharpen)
    : /*  %0           %1          %2              %3               %4          */
  );

}

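/* For reference, a scalar sketch of the Wiener + sharpen path implemented by
 * processSharpen_SSE3/processSharpen_SSE above. Illustrative only: the helper
 * name is invented for this sketch and nothing calls it; fmaxf plays the role
 * of the MAX in the comments inside the asm.
 */
static inline void referenceWienerSharpenScalar(fftwf_complex* outcur,
                                                const fftwf_complex* gridsample,
                                                const float* wsharpen,
                                                float gridfraction,
                                                float sigmaSquaredNoiseNormed,
                                                float lowlimit,
                                                float sigmaSquaredSharpenMin,
                                                float sigmaSquaredSharpenMax,
                                                int size)
{
  for (int x = 0; x < size; x++) {
    float gridcorrection0 = gridfraction * gridsample[x][0];
    float gridcorrection1 = gridfraction * gridsample[x][1];
    float re = outcur[x][0] - gridcorrection0;
    float im = outcur[x][1] - gridcorrection1;
    float psd = re*re + im*im + 1e-15f;
    // Limited Wiener filter: keep the estimated signal fraction of each bin,
    // but never attenuate below lowlimit.
    float WienerFactor = fmaxf((psd - sigmaSquaredNoiseNormed)/psd, lowlimit);
    float sfact = 1.0f + wsharpen[x] * sqrtf(psd*sigmaSquaredSharpenMax /
        ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)));
    float factor = WienerFactor * sfact;  // the asm folds both into one multiply
    outcur[x][0] = re*factor + gridcorrection0;
    outcur[x][1] = im*factor + gridcorrection1;
  }
}
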
void ComplexWienerFilterDeGrid::processNoSharpen_SSE( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredNoiseNormed;  // 32
    temp[i+12] = lowlimit;                // 48
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_wienerdegridnosharpen_sse:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "movaps %%xmm4, %%xmm7\n"
    "shufps $136, %%xmm5, %%xmm4\n"   // xmm4 = r0 r1 r2 r3 (squared)  [10 00 10 00 = 136]
    "shufps $221, %%xmm5, %%xmm7\n"   // xmm7 = i0 i1 i2 i3 (squared)  [11 01 11 01 = 221]
    "addps %%xmm7, %%xmm4\n"          // xmm4 = re*re + im*im per pixel

    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps 32(%1), %%xmm5\n"         // sigmaSquaredNoiseNormed in xmm5
    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7 = 1 / psd
    "subps %%xmm5, %%xmm6\n"          // xmm6 = psd - sigmaSquaredNoiseNormed; xmm5 free
    "movaps 48(%1), %%xmm5\n"         // xmm5 = lowlimit
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm6, %%xmm5\n"          // xmm5 = WienerFactor = MAX(xmm6, lowlimit)

    "movaps %%xmm5, %%xmm7\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= WienerFactor
    "mulps %%xmm5, %%xmm1\n"          // re+im *= WienerFactor
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridnosharpen_sse\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample)
    : /*  %0           %1          %2              %3          */
  );

}
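
/* For reference, a scalar sketch of the Wiener-only path implemented by
 * processNoSharpen_SSE above and the processNoSharpen_SSE3 variants below.
 * Illustrative only: the helper name is invented for this sketch and
 * nothing calls it.
 */
static inline void referenceWienerNoSharpenScalar(fftwf_complex* outcur,
                                                  const fftwf_complex* gridsample,
                                                  float gridfraction,
                                                  float sigmaSquaredNoiseNormed,
                                                  float lowlimit,
                                                  int size)
{
  for (int x = 0; x < size; x++) {
    float gridcorrection0 = gridfraction * gridsample[x][0];
    float gridcorrection1 = gridfraction * gridsample[x][1];
    float re = outcur[x][0] - gridcorrection0;
    float im = outcur[x][1] - gridcorrection1;
    float psd = re*re + im*im + 1e-15f;
    float WienerFactor = fmaxf((psd - sigmaSquaredNoiseNormed)/psd, lowlimit);
    outcur[x][0] = re*WienerFactor + gridcorrection0;
    outcur[x][1] = im*WienerFactor + gridcorrection1;
  }
}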
#if defined (__x86_64__)
void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredNoiseNormed;  // 32
    temp[i+12] = lowlimit;                // 48
  }
  int size = bw*bh;
  if ((size & 7) == 0) {  // TODO: Benchmark whether the 8-wide unroll is actually faster
    asm volatile
    (
    "movaps (%1), %%xmm14\n"          // xmm14: 1e-15
    "movaps 16(%1), %%xmm15\n"        // Load gridfraction into xmm15
    "loop_wienerdegridnosharpen_sse3_big:\n"
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps 32(%2), %%xmm8\n"         // in r4i4 r5i5
    "movaps 48(%2), %%xmm9\n"         // in r6i6 r7i7
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3
    "movaps 32(%3), %%xmm10\n"        // grid r4i4 r5i5
    "movaps 48(%3), %%xmm11\n"        // grid r6i6 r7i7

    "mulps %%xmm15, %%xmm4\n"         // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm15, %%xmm5\n"         // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "mulps %%xmm15, %%xmm10\n"        // grid r4*gf i4*gf r5*gf i5*gf
    "mulps %%xmm15, %%xmm11\n"        // grid r6*gf i6*gf r7*gf i7*gf
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "movaps %%xmm10, %%xmm12\n"       // keep gridcorrection in xmm12/xmm13
    "movaps %%xmm11, %%xmm13\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "subps %%xmm10, %%xmm8\n"         // re4 im4 re5 im5
    "subps %%xmm11, %%xmm9\n"         // re6 im6 re7 im7
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"
    "movaps %%xmm8, %%xmm10\n"        // copy re4+im4 ...
    "movaps %%xmm9, %%xmm11\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "mulps %%xmm10, %%xmm10\n"        // r4i4 r5i5 squared
    "mulps %%xmm11, %%xmm11\n"        // r6i6 r7i7 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free
    "haddps %%xmm11, %%xmm10\n"       // r4+i4 r5+i5 r6+i6 r7+i7 (all squared) (SSE3!) - xmm11 free

    "addps %%xmm14, %%xmm4\n"         // add 1e-15 (xmm4: psd for pixels 0-3)
    "addps %%xmm14, %%xmm10\n"        // add 1e-15 (xmm10: psd for pixels 4-7)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps %%xmm4, %%xmm5\n"         // Copy psd into xmm5
    "movaps %%xmm10, %%xmm11\n"       // Copy psd into xmm11

    "rcpps %%xmm4, %%xmm4\n"          // xmm4: (1 / psd)
    "movaps 32(%1), %%xmm7\n"         // xmm7 = sigmaSquaredNoiseNormed
    "rcpps %%xmm10, %%xmm10\n"        // xmm10: (1 / psd)
    "subps %%xmm7, %%xmm5\n"          // xmm5 = psd - sigmaSquaredNoiseNormed
    "subps %%xmm7, %%xmm11\n"         // xmm11 = psd - sigmaSquaredNoiseNormed
    "movaps 48(%1), %%xmm7\n"         // xmm7 = lowlimit
    "mulps %%xmm4, %%xmm5\n"          // xmm5 = (psd - sigmaSquaredNoiseNormed)/psd
    "mulps %%xmm10, %%xmm11\n"        // xmm11 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm7, %%xmm5\n"          // xmm5 = WienerFactor = MAX(xmm5, lowlimit)
    "maxps %%xmm7, %%xmm11\n"         // xmm11 = WienerFactor = MAX(xmm11, lowlimit)

    "movaps %%xmm5, %%xmm7\n"
    "movaps %%xmm11, %%xmm10\n"
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm11, %%xmm11\n"     // unpack high to xmm11
    "unpcklps %%xmm10, %%xmm10\n"     // unpack low to xmm10

    "mulps %%xmm7, %%xmm0\n"          // re+im *= WienerFactor
    "mulps %%xmm5, %%xmm1\n"          // re+im *= WienerFactor
    "mulps %%xmm10, %%xmm8\n"         // re+im *= WienerFactor
    "mulps %%xmm11, %%xmm9\n"         // re+im *= WienerFactor
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "addps %%xmm12, %%xmm8\n"         // add gridcorrection
    "addps %%xmm13, %%xmm9\n"         // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "movaps %%xmm8, 32(%2)\n"         // Store
    "movaps %%xmm9, 48(%2)\n"         // Store
    "sub $8, %0\n"                    // size -= 8
    "add $64, %2\n"                   // outcur += 64 bytes
    "add $64, %3\n"                   // gridsample += 64 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridnosharpen_sse3_big\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample)
    : /*  %0           %1          %2              %3          */
    );
  } else {
    asm volatile
    (
    "movaps (%1), %%xmm14\n"          // xmm14: 1e-15
    "movaps 16(%1), %%xmm15\n"        // Load gridfraction into xmm15
    "movaps 32(%1), %%xmm13\n"        // sigmaSquaredNoiseNormed in xmm13
    "movaps 48(%1), %%xmm12\n"        // xmm12 = lowlimit
    "loop_wienerdegridnosharpen_sse3:\n"
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm15, %%xmm4\n"         // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm15, %%xmm5\n"         // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free

    "addps %%xmm14, %%xmm4\n"         // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7: (1 / psd)
    "subps %%xmm13, %%xmm6\n"         // xmm6 = psd - sigmaSquaredNoiseNormed
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm12, %%xmm6\n"         // xmm6 = WienerFactor = MAX(xmm6, lowlimit)

    "movaps %%xmm6, %%xmm7\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm6, %%xmm6\n"       // unpack high to xmm6

    "mulps %%xmm7, %%xmm0\n"          // re+im *= WienerFactor
    "mulps %%xmm6, %%xmm1\n"          // re+im *= WienerFactor
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridnosharpen_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample)
    : /*  %0           %1          %2              %3          */
    );
  }
}


#else // 32 bits
void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
{
  fftwf_complex* outcur = block->complex;
  fftwf_complex* gridsample = grid->complex;
  float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
  float* temp = block->temp->data;  // Get aligned temp area, at least 256 bytes, only used by this thread.

  for (int i = 0; i < 4; i++) {
    temp[i+0] = 1e-15f;                   // 0
    temp[i+4] = gridfraction;             // 16
    temp[i+8] = sigmaSquaredNoiseNormed;  // 32
    temp[i+12] = lowlimit;                // 48
  }
  int size = bw*bh;
  asm volatile
  (
    "loop_wienerdegridnosharpen_sse3:\n"
    "movaps 16(%1),%%xmm6\n"          // Load gridfraction into xmm6
    "movaps (%2), %%xmm0\n"           // in r0i0 r1i1
    "movaps 16(%2), %%xmm1\n"         // in r2i2 r3i3
    "movaps (%3), %%xmm4\n"           // grid r0i0 r1i1
    "movaps 16(%3), %%xmm5\n"         // grid r2i2 r3i3

    "mulps %%xmm6, %%xmm4\n"          // grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection for pixels 0-1)
    "mulps %%xmm6, %%xmm5\n"          // grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
    "movaps %%xmm4, %%xmm2\n"         // keep gridcorrection in xmm2/xmm3
    "movaps %%xmm5, %%xmm3\n"
    "subps %%xmm4, %%xmm0\n"          // re0 im0 re1 im1  (re = outcur[x][0] - gridcorrection0, etc.)
    "subps %%xmm5, %%xmm1\n"          // re2 im2 re3 im3
    "movaps %%xmm0, %%xmm4\n"         // copy re/im into xmm4 and xmm5; xmm0 & xmm1 retained
    "movaps %%xmm1, %%xmm5\n"

    "mulps %%xmm4, %%xmm4\n"          // r0i0 r1i1 squared
    "mulps %%xmm5, %%xmm5\n"          // r2i2 r3i3 squared
    "haddps %%xmm5, %%xmm4\n"         // r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm5 free

    "addps (%1), %%xmm4\n"            // add 1e-15 (xmm4: psd for all 4 pixels)

    //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter

    "movaps 32(%1), %%xmm5\n"         // sigmaSquaredNoiseNormed in xmm5
    "movaps %%xmm4, %%xmm6\n"         // Copy psd into xmm6
    "rcpps %%xmm4, %%xmm7\n"          // xmm7: (1 / psd)
    "subps %%xmm5, %%xmm6\n"          // xmm6 = psd - sigmaSquaredNoiseNormed; xmm5 free
    "movaps 48(%1), %%xmm5\n"         // xmm5 = lowlimit
    "mulps %%xmm7, %%xmm6\n"          // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
    "maxps %%xmm6, %%xmm5\n"          // xmm5 = WienerFactor = MAX(xmm6, lowlimit)

    "movaps %%xmm5, %%xmm7\n"
    "unpcklps %%xmm7, %%xmm7\n"       // unpack low to xmm7
    "unpckhps %%xmm5, %%xmm5\n"       // unpack high to xmm5

    "mulps %%xmm7, %%xmm0\n"          // re+im *= WienerFactor
    "mulps %%xmm5, %%xmm1\n"          // re+im *= WienerFactor
    "addps %%xmm2, %%xmm0\n"          // add gridcorrection
    "addps %%xmm3, %%xmm1\n"          // add gridcorrection
    "movaps %%xmm0, (%2)\n"           // Store
    "movaps %%xmm1, 16(%2)\n"         // Store
    "sub $4, %0\n"                    // size -= 4
    "add $32, %2\n"                   // outcur += 32 bytes
    "add $32, %3\n"                   // gridsample += 32 bytes
    "cmp $0, %0\n"
    "jg loop_wienerdegridnosharpen_sse3\n"
    : /* no output registers */
    : "r" (size), "r" (temp),  "r" (outcur), "r" (gridsample)
    : /*  %0           %1          %2              %3          */
  );
}
#endif

#endif // defined (__i386__) || defined (__x86_64__)

}} // namespace RawStudio::FFTFilter