/*
 * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
 * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
19
20 #include "complexfilter.h"
21 #include <math.h>
22 #include "fftwindow.h"
23
24 namespace RawStudio {
25 namespace FFTFilter {
26
27 #if defined (__i386__) || defined (__x86_64__)
28
29 #if defined (__x86_64__)
processSharpenOnlySSE3(ComplexBlock * block)30 void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
31 fftwf_complex* outcur = block->complex;
32 fftwf_complex* gridsample = grid->complex;
33 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
34 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
35 float *wsharpen = sharpenWindow->getLine(0);
36
37 for (int i = 0; i < 4; i++) {
38 temp[i+0] = 1e-15f; // 0
39 temp[i+4] = gridfraction; // 16
40 temp[i+8] = sigmaSquaredSharpenMin; // 32
41 temp[i+12] = sigmaSquaredSharpenMax; // 48
42 temp[i+16] = 1.0f; // 64
43 }
44 int size = bw*bh;
45 if ((size & 7) == 0) { // TODO: Benchmark on non-vm platform - slower under VMWARE
46 asm volatile
47 (
48 "movaps 16(%1),%%xmm14\n" // Load gridfraction into xmm6
49 "loop_sharpenonly_sse3_big:\n"
50 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
51 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
52 "movaps 32(%2), %%xmm8\n" // in r0i0 r1i1
53 "movaps 48(%2), %%xmm9\n" //in r2i2 r3i3
54 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
55 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
56 "movaps 32(%3), %%xmm12\n" // grid r0i0 r1i1
57 "movaps 48(%3), %%xmm13\n" // grid r2i2 r3i3
58
59 "mulps %%xmm14, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
60 "mulps %%xmm14, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
61 "mulps %%xmm14, %%xmm12\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
62 "mulps %%xmm14, %%xmm13\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
63 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
64 "movaps %%xmm5, %%xmm3\n"
65 "movaps %%xmm12, %%xmm10\n" // maintain gridcorrection in memory
66 "movaps %%xmm13, %%xmm11\n"
67 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
68 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
69 "subps %%xmm12, %%xmm8\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
70 "subps %%xmm13, %%xmm9\n" // re2 im2 re3 im3 -
71 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
72 "movaps %%xmm1, %%xmm5\n"
73 "movaps %%xmm8, %%xmm12\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
74 "movaps %%xmm9, %%xmm13\n"
75
76 "movaps (%1), %%xmm6\n" // Move 1e-15 into xmm6
77 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
78 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
79 "mulps %%xmm12, %%xmm12\n" //r0i0 r1i1 squared
80 "mulps %%xmm13, %%xmm13\n" //r2i2 r3i3 squared
81 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
82 "haddps %%xmm13, %%xmm12\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
83 "addps %%xmm6, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
84 "addps %%xmm6, %%xmm12\n" // add 1e-15 (xmm4: psd for all 4 pixels)
85 "movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
86
87 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
88 "movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
89 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
90 "movaps %%xmm12, %%xmm13\n" // Copy psd into xmm5
91 "movaps %%xmm7, %%xmm15\n" // Copy sigmaSquaredSharpenMax
92 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
93 "addps %%xmm7, %%xmm12\n" // xmm4 = psd + sigmaSquaredSharpenMax
94 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
95 "mulps %%xmm13, %%xmm15\n" // xmm7 = psd*sigmaSquaredSharpenMax
96 "addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
97 "addps %%xmm6, %%xmm13\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
98
99 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
100 "mulps %%xmm12, %%xmm13\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
101 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
102 "movaps (%4), %%xmm6\n" // load wsharpen[0->4]
103 "rcpps %%xmm13, %%xmm13\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
104 "movaps 16(%4), %%xmm4\n" // Load wsharpen
105 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
106 "mulps %%xmm13, %%xmm15\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
107 "movaps 64(%1), %%xmm5\n" // Load "1.0"
108 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
109 "rsqrtps %%xmm15, %%xmm15\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
110 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
111 "rcpps %%xmm15, %%xmm15\n" // sqrt (..)
112 "mulps %%xmm6, %%xmm7\n" // multiply wsharpen
113 "mulps %%xmm4, %%xmm15\n" // multiply wsharpen
114 "addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
115 "addps %%xmm5, %%xmm15\n" // + 1.0 xmm7 = sfact
116 "movaps %%xmm7, %%xmm5\n"
117 "movaps %%xmm15, %%xmm13\n"
118 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
119 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
120 "unpcklps %%xmm15, %%xmm15\n" // unpack low to xmm7
121 "unpckhps %%xmm13, %%xmm13\n" // unpack high to xmm5
122
123 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
124 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
125 "mulps %%xmm15, %%xmm8\n" // re+im *= sfact
126 "mulps %%xmm13, %%xmm9\n" // re+im *= sfact
127 "addps %%xmm2, %%xmm0\n" // add gridcorrection
128 "addps %%xmm3, %%xmm1\n" // add gridcorrection
129 "addps %%xmm10, %%xmm8\n" // add gridcorrection
130 "addps %%xmm11, %%xmm9\n" // add gridcorrection
131 "movaps %%xmm0, (%2)\n" // Store
132 "movaps %%xmm1, 16(%2)\n" // Store
133 "movaps %%xmm8, 32(%2)\n" // Store
134 "movaps %%xmm9, 48(%2)\n" // Store
135 "sub $8, %0\n" // size -=8
136 "add $64, %2\n" // outcur+=64
137 "add $64, %3\n" // gridsample+=64
138 "add $32, %4\n" // wsharpen+=32
139 "cmp $0, %0\n"
140 "jg loop_sharpenonly_sse3_big\n"
141 : /* no output registers */
142 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
143 : /* %0 %1 %2 %3 %4 */
144 );
145 } else {
146 asm volatile
147 (
148 "movaps (%1), %%xmm11\n"
149 "movaps 16(%1), %%xmm12\n"
150 "movaps 32(%1), %%xmm13\n"
151 "movaps 48(%1), %%xmm14\n"
152 "movaps 64(%1), %%xmm15\n"
153 "loop_sharpenonly_sse3:\n"
154 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
155 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
156 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
157 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
158
159 "mulps %%xmm12, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
160 "mulps %%xmm12, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
161 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
162 "movaps %%xmm5, %%xmm3\n"
163 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
164 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
165 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
166 "movaps %%xmm1, %%xmm5\n"
167
168 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
169 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
170 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
171 "addps %%xmm11, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
172 "movaps %%xmm14, %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
173
174 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
175 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
176 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
177 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
178 "addps %%xmm13, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
179
180 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
181 "movaps (%4), %%xmm6\n" // load wsharpen[0->4]
182 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
183 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
184 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
185 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
186 "mulps %%xmm6, %%xmm7\n" // multiply wsharpen
187 "addps %%xmm15, %%xmm7\n" // + 1.0 xmm7 = sfact
188 "movaps %%xmm7, %%xmm5\n"
189 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
190 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
191
192 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
193 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
194 "addps %%xmm2, %%xmm0\n" // add gridcorrection
195 "addps %%xmm3, %%xmm1\n" // add gridcorrection
196 "movaps %%xmm0, (%2)\n" // Store
197 "movaps %%xmm1, 16(%2)\n" // Store
198 "sub $4, %0\n" // size -=4
199 "add $32, %2\n" // outcur+=32
200 "add $32, %3\n" // gridsample+=32
201 "add $16, %4\n" // wsharpen+=16
202 "cmp $0, %0\n"
203 "jg loop_sharpenonly_sse3\n"
204 : /* no output registers */
205 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
206 : /* %0 %1 %2 %3 %4 */
207 );
208 }
209 }
210
211 #else // 32 bits
212 void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
213 fftwf_complex* outcur = block->complex;
214 fftwf_complex* gridsample = grid->complex;
215 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
216 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
217 float *wsharpen = sharpenWindow->getLine(0);
218
219 for (int i = 0; i < 4; i++) {
220 temp[i+0] = 1e-15f; // 0
221 temp[i+4] = gridfraction; // 16
222 temp[i+8] = sigmaSquaredSharpenMin; // 32
223 temp[i+12] = sigmaSquaredSharpenMax; // 48
224 temp[i+16] = 1.0f; // 64
225 }
226 int size = bw*bh;
227 asm volatile
228 (
229 "loop_sharpenonly_sse3:\n"
230 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
231 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
232 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
233 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
234 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
235
236 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
237 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
238 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
239 "movaps %%xmm5, %%xmm3\n"
240 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
241 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
242 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
243 "movaps %%xmm1, %%xmm5\n"
244
245 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
246 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
247 "movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
248 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
249 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
250 "movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
251
252 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
253 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
254 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
255 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
256 "addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
257
258 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
259 "movaps (%4), %%xmm6\n" // load wsharpen[0->4]
260 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
261 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
262 "movaps 64(%1), %%xmm5\n" // Load "1.0"
263 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
264 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
265 "mulps %%xmm6, %%xmm7\n" // multiply wsharpen
266 "addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
267 "movaps %%xmm7, %%xmm5\n"
268 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
269 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
270
271 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
272 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
273 "addps %%xmm2, %%xmm0\n" // add gridcorrection
274 "addps %%xmm3, %%xmm1\n" // add gridcorrection
275 "movaps %%xmm0, (%2)\n" // Store
276 "movaps %%xmm1, 16(%2)\n" // Store
277 "sub $4, %0\n" // size -=4
278 "add $32, %2\n" // outcur+=32
279 "add $32, %3\n" // gridsample+=32
280 "add $16, %4\n" // wsharpen+=16
281 "cmp $0, %0\n"
282 "jg loop_sharpenonly_sse3\n"
283 : /* no output registers */
284 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
285 : /* %0 %1 %2 %3 %4 */
286 );
287 }
288 #endif
processSharpenOnlySSE(ComplexBlock * block)289 void DeGridComplexFilter::processSharpenOnlySSE(ComplexBlock* block) {
290 fftwf_complex* outcur = block->complex;
291 fftwf_complex* gridsample = grid->complex;
292 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
293 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
294 float *wsharpen = sharpenWindow->getLine(0);
295
296 for (int i = 0; i < 4; i++) {
297 temp[i+0] = 1e-15f; // 0
298 temp[i+4] = gridfraction; // 16
299 temp[i+8] = sigmaSquaredSharpenMin; // 32
300 temp[i+12] = sigmaSquaredSharpenMax; // 48
301 temp[i+16] = 1.0f; // 64
302 }
303 int size = bw*bh;
304 asm volatile
305 (
306 "loop_sharpenonly_sse:\n"
307 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
308 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
309 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
310 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
311 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
312
313 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
314 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
315 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
316 "movaps %%xmm5, %%xmm3\n"
317 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
318 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
319 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
320 "movaps %%xmm1, %%xmm5\n"
321
322 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
323 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
324
325 "movaps %%xmm4, %%xmm7\n"
326 "shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
327 "shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
328 "movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
329 "addps %%xmm7, %%xmm4\n"
330 "movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
331 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
332
333 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
334 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
335 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
336 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
337 "addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
338
339 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
340 "movaps (%4), %%xmm6\n" // load wsharpen[0->4]
341 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
342 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
343 "movaps 64(%1), %%xmm5\n" // Load "1.0"
344 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
345 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
346 "mulps %%xmm6, %%xmm7\n" // multiply wsharpen
347 "addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
348 "movaps %%xmm7, %%xmm5\n"
349 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
350 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
351
352 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
353 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
354 "addps %%xmm2, %%xmm0\n" // add gridcorrection
355 "addps %%xmm3, %%xmm1\n" // add gridcorrection
356 "movaps %%xmm0, (%2)\n" // Store
357 "movaps %%xmm1, 16(%2)\n" // Store
358 "sub $4, %0\n" // size -=4
359 "add $32, %2\n" // outcur+=32
360 "add $32, %3\n" // gridsample+=32
361 "add $16, %4\n" // wsharpen+=16
362 "cmp $0, %0\n"
363 "jg loop_sharpenonly_sse\n"
364 : /* no output registers */
365 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
366 : /* %0 %1 %2 %3 %4 */
367 );
368 }
369 #if defined (__x86_64__)
processSharpen_SSE3(ComplexBlock * block)370 void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
371 {
372 fftwf_complex* outcur = block->complex;
373 fftwf_complex* gridsample = grid->complex;
374 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
375 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
376 float *wsharpen = sharpenWindow->getLine(0);
377
378 for (int i = 0; i < 4; i++) {
379 temp[i+0] = 1e-15f; // 0
380 temp[i+4] = gridfraction; // 16
381 temp[i+8] = sigmaSquaredSharpenMin; // 32
382 temp[i+12] = sigmaSquaredSharpenMax; // 48
383 temp[i+16] = 1.0f; // 64
384 temp[i+20] = sigmaSquaredNoiseNormed; // 80
385 temp[i+24] = lowlimit; // 96
386 }
387 int size = bw*bh;
388 asm volatile
389 (
390 "movaps (%1), %%xmm15\n" // 1e-15f
391 "movaps 16(%1),%%xmm14\n" // Load gridfraction into xmm14
392 "movaps 32(%1), %%xmm10\n" //xmm10 sigmaSquaredSharpenMin
393 "movaps 48(%1), %%xmm11\n" // Move sigmaSquaredSharpenMax into xmm11
394 "movaps 64(%1), %%xmm9\n" // Load "1.0"
395 "movaps 80(%1), %%xmm13\n" //sigmaSquaredNoiseNormed in xmm13
396 "movaps 96(%1), %%xmm12\n" // xmm12 = lowlimit
397 "loop_wienerdegridsharpen_sse3:\n"
398 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
399 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
400 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
401 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
402
403 "mulps %%xmm14, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
404 "mulps %%xmm14, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
405 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
406 "movaps %%xmm5, %%xmm3\n"
407 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
408 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
409 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
410 "movaps %%xmm1, %%xmm5\n"
411
412 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
413 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
414 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
415 "addps %%xmm15, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
416
417 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
418
419 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
420 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
421 "subps %%xmm13, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
422 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
423 "maxps %%xmm12, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
424
425 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
426 "movaps %%xmm11, %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
427 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
428 "addps %%xmm11, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
429 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
430 "addps %%xmm10, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
431
432 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
433 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
434 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
435 "rsqrtps %%xmm7, %%xmm7\n" // 1 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
436 "rcpps %%xmm7, %%xmm7\n" // sqrt(...)
437 "mulps (%4), %%xmm7\n" // multiply wsharpen
438 "addps %%xmm9, %%xmm7\n" // + 1.0 xmm7 = sfact
439 "mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
440 "movaps %%xmm7, %%xmm5\n"
441 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
442 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
443
444 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
445 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
446 "addps %%xmm2, %%xmm0\n" // add gridcorrection
447 "addps %%xmm3, %%xmm1\n" // add gridcorrection
448 "movaps %%xmm0, (%2)\n" // Store
449 "movaps %%xmm1, 16(%2)\n" // Store
450 "sub $4, %0\n" // size -=4
451 "add $32, %2\n" // outcur+=32
452 "add $32, %3\n" // gridsample+=32
453 "add $16, %4\n" // wsharpen+=16
454 "cmp $0, %0\n"
455 "jg loop_wienerdegridsharpen_sse3\n"
456 : /* no output registers */
457 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
458 : /* %0 %1 %2 %3 %4 */
459 );
460 }
461
462 #else // 32 bits
463
processSharpen_SSE3(ComplexBlock * block)464 void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
465 {
466 fftwf_complex* outcur = block->complex;
467 fftwf_complex* gridsample = grid->complex;
468 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
469 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
470 float *wsharpen = sharpenWindow->getLine(0);
471
472 for (int i = 0; i < 4; i++) {
473 temp[i+0] = 1e-15f; // 0
474 temp[i+4] = gridfraction; // 16
475 temp[i+8] = sigmaSquaredSharpenMin; // 32
476 temp[i+12] = sigmaSquaredSharpenMax; // 48
477 temp[i+16] = 1.0f; // 64
478 temp[i+20] = sigmaSquaredNoiseNormed; // 80
479 temp[i+24] = lowlimit; // 96
480 }
481 int size = bw*bh;
482 asm volatile
483 (
484 "loop_wienerdegridsharpen_sse3:\n"
485 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
486 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
487 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
488 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
489 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
490
491 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
492 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
493 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
494 "movaps %%xmm5, %%xmm3\n"
495 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
496 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
497 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
498 "movaps %%xmm1, %%xmm5\n"
499
500 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
501 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
502 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
503 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
504
505 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
506
507 "movaps 80(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
508 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
509 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
510 "subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
511 "movaps 96(%1), %%xmm5\n" // xmm5 = lowlimit
512 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
513 "maxps %%xmm5, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
514
515 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
516 "movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
517 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
518 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
519 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
520 "addps 32(%1), %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
521
522 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
523 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
524 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
525 "movaps 64(%1), %%xmm5\n" // Load "1.0"
526 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
527 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
528 "mulps (%4), %%xmm7\n" // multiply wsharpen
529 "addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
530 "mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
531 "movaps %%xmm7, %%xmm5\n"
532 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
533 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
534
535 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
536 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
537 "addps %%xmm2, %%xmm0\n" // add gridcorrection
538 "addps %%xmm3, %%xmm1\n" // add gridcorrection
539 "movaps %%xmm0, (%2)\n" // Store
540 "movaps %%xmm1, 16(%2)\n" // Store
541 "sub $4, %0\n" // size -=4
542 "add $32, %2\n" // outcur+=32
543 "add $32, %3\n" // gridsample+=32
544 "add $16, %4\n" // wsharpen+=16
545 "cmp $0, %0\n"
546 "jg loop_wienerdegridsharpen_sse3\n"
547 : /* no output registers */
548 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
549 : /* %0 %1 %2 %3 %4 */
550 );
551 }
552 #endif
553
processSharpen_SSE(ComplexBlock * block)554 void ComplexWienerFilterDeGrid::processSharpen_SSE( ComplexBlock* block )
555 {
556 fftwf_complex* outcur = block->complex;
557 fftwf_complex* gridsample = grid->complex;
558 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
559 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
560 float *wsharpen = sharpenWindow->getLine(0);
561
562 for (int i = 0; i < 4; i++) {
563 temp[i+0] = 1e-15f; // 0
564 temp[i+4] = gridfraction; // 16
565 temp[i+8] = sigmaSquaredSharpenMin; // 32
566 temp[i+12] = sigmaSquaredSharpenMax; // 48
567 temp[i+16] = 1.0f; // 64
568 temp[i+20] = sigmaSquaredNoiseNormed; // 72
569 temp[i+24] = lowlimit; // 96
570 }
571 int size = bw*bh;
572 asm volatile
573 (
574 "loop_wienerdegridsharpen_sse:\n"
575 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
576 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
577 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
578 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
579 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
580
581 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
582 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
583 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
584 "movaps %%xmm5, %%xmm3\n"
585 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
586 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
587 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
588 "movaps %%xmm1, %%xmm5\n"
589
590 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
591 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
592 "movaps %%xmm4, %%xmm7\n"
593 "shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
594 "shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
595 "addps %%xmm7, %%xmm4\n"
596
597
598 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
599
600 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
601
602 "movaps 80(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
603 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
604 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
605 "subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
606 "movaps 96(%1), %%xmm5\n" // xmm5 = lowlimit
607 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
608 "maxps %%xmm5, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
609
610 // float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
611 "movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
612 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
613 "addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
614 "mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
615 "addps 32(%1), %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
616
617 "mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
618 "rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
619 "mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
620 "movaps 64(%1), %%xmm5\n" // Load "1.0"
621 "rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
622 "rcpps %%xmm7, %%xmm7\n" // sqrt (..)
623 "mulps (%4), %%xmm7\n" // multiply wsharpen
624 "addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
625 "mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
626 "movaps %%xmm7, %%xmm5\n"
627 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
628 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
629
630 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
631 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
632 "addps %%xmm2, %%xmm0\n" // add gridcorrection
633 "addps %%xmm3, %%xmm1\n" // add gridcorrection
634 "movaps %%xmm0, (%2)\n" // Store
635 "movaps %%xmm1, 16(%2)\n" // Store
636 "sub $4, %0\n" // size -=4
637 "add $32, %2\n" // outcur+=32
638 "add $32, %3\n" // gridsample+=32
639 "add $16, %4\n" // wsharpen+=16
640 "cmp $0, %0\n"
641 "jg loop_wienerdegridsharpen_sse\n"
642 : /* no output registers */
643 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
644 : /* %0 %1 %2 %3 %4 */
645 );
646
647 }
648
processNoSharpen_SSE(ComplexBlock * block)649 void ComplexWienerFilterDeGrid::processNoSharpen_SSE( ComplexBlock* block )
650 {
651 fftwf_complex* outcur = block->complex;
652 fftwf_complex* gridsample = grid->complex;
653 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
654 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
655
656 for (int i = 0; i < 4; i++) {
657 temp[i+0] = 1e-15f; // 0
658 temp[i+4] = gridfraction; // 16
659 temp[i+8] = sigmaSquaredNoiseNormed; // 32
660 temp[i+12] = lowlimit; // 48
661 }
662 int size = bw*bh;
663 asm volatile
664 (
665 "loop_wienerdegridnosharpen_sse:\n"
666 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
667 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
668 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
669 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
670 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
671
672 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
673 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
674 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
675 "movaps %%xmm5, %%xmm3\n"
676 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
677 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
678 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
679 "movaps %%xmm1, %%xmm5\n"
680
681 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
682 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
683 "movaps %%xmm4, %%xmm7\n"
684 "shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
685 "shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
686 "addps %%xmm7, %%xmm4\n"
687
688 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
689
690 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
691
692 "movaps 32(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
693 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
694 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
695 "subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
696 "movaps 48(%1), %%xmm5\n" // xmm5 = lowlimit
697 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
698 "maxps %%xmm6, %%xmm5\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
699
700 "movaps %%xmm5, %%xmm7\n"
701 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
702 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
703
704 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
705 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
706 "addps %%xmm2, %%xmm0\n" // add gridcorrection
707 "addps %%xmm3, %%xmm1\n" // add gridcorrection
708 "movaps %%xmm0, (%2)\n" // Store
709 "movaps %%xmm1, 16(%2)\n" // Store
710 "sub $4, %0\n" // size -=4
711 "add $32, %2\n" // outcur+=32
712 "add $32, %3\n" // gridsample+=32
713 "cmp $0, %0\n"
714 "jg loop_wienerdegridnosharpen_sse\n"
715 : /* no output registers */
716 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
717 : /* %0 %1 %2 %3 */
718 );
719
720 }
721 #if defined (__x86_64__)
processNoSharpen_SSE3(ComplexBlock * block)722 void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
723 {
724 fftwf_complex* outcur = block->complex;
725 fftwf_complex* gridsample = grid->complex;
726 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
727 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
728
729 for (int i = 0; i < 4; i++) {
730 temp[i+0] = 1e-15f; // 0
731 temp[i+4] = gridfraction; // 16
732 temp[i+8] = sigmaSquaredNoiseNormed; // 32
733 temp[i+12] = lowlimit; // 48
734 }
735 int size = bw*bh;
736 if ((size & 7) == 0) { //TODO: Bench me to see if I'm faster
737 asm volatile
738 (
739 "movaps (%1), %%xmm14\n" // xmm14: 1e-15
740 "movaps 16(%1), %%xmm15\n" // Load gridfraction into xmm15
741 "loop_wienerdegridnosharpen_sse3_big:\n"
742 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
743 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
744 "movaps 32(%2), %%xmm8\n" // in r4i4 r5i5
745 "movaps 48(%2), %%xmm9\n" //in r6i6 r7i7
746 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
747 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
748 "movaps 32(%3), %%xmm10\n" // grid r4i4 r5i5
749 "movaps 48(%3), %%xmm11\n" // grid r6i6 r7i7
750
751 "mulps %%xmm15, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
752 "mulps %%xmm15, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
753 "mulps %%xmm15, %%xmm10\n" //grid r4*gf i4*gf r5*gf i5*gf
754 "mulps %%xmm15, %%xmm11\n" //grid r6*gf i6*gf r7*gf i7*gf
755 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
756 "movaps %%xmm5, %%xmm3\n"
757 "movaps %%xmm10, %%xmm12\n" // maintain gridcorrection in memory
758 "movaps %%xmm11, %%xmm13\n"
759 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
760 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
761 "subps %%xmm10, %%xmm8\n" // re4 im4 re5 im5 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
762 "subps %%xmm11, %%xmm9\n" // re6 im6 re7 im7 -
763 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
764 "movaps %%xmm1, %%xmm5\n"
765 "movaps %%xmm8, %%xmm10\n" // copy re4+im4
766 "movaps %%xmm9, %%xmm11\n"
767
768 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
769 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
770 "mulps %%xmm10, %%xmm10\n" //r4i4 r5i5 squared
771 "mulps %%xmm11, %%xmm11\n" //r6i6 r7i7 squared
772 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm 5 free
773 "haddps %%xmm11, %%xmm10\n" //r4+i4 r5+i5 r6+i6 r7+i7 (all squared) (SSE3!) - xmm 11 free
774
775 "addps %%xmm14, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
776 "addps %%xmm14, %%xmm10\n" // add 1e-15 (xmm10: psd for all 4 pixels)
777
778 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
779
780 "movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
781 "movaps %%xmm10, %%xmm11\n" // Copy psd into xmm11
782
783 "rcpps %%xmm4, %%xmm4\n" // xmm4: (1 / psd)
784 "movaps 32(%1), %%xmm7\n"
785 "rcpps %%xmm10, %%xmm10\n" // xmm10: (1 / psd)
786 "subps %%xmm7, %%xmm5\n" // xmm5 (psd) - xmm5 (ssnn)
787 "subps %%xmm7, %%xmm11\n" // xmm11 (psd) - xmm5 (ssnn)
788 "movaps 48(%1), %%xmm7\n"
789 "mulps %%xmm4, %%xmm5\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
790 "mulps %%xmm10, %%xmm11\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
791 "maxps %%xmm7, %%xmm5\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
792 "maxps %%xmm7, %%xmm11\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
793
794 "movaps %%xmm5, %%xmm7\n"
795 "movaps %%xmm11, %%xmm10\n"
796 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
797 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
798 "unpckhps %%xmm11, %%xmm11\n" // unpack high to xmm11
799 "unpcklps %%xmm10, %%xmm10\n" // unpack low to xmm10
800
801 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
802 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
803 "mulps %%xmm10, %%xmm8\n" // re+im *= sfact
804 "mulps %%xmm11, %%xmm9\n" // re+im *= sfact
805 "addps %%xmm2, %%xmm0\n" // add gridcorrection
806 "addps %%xmm3, %%xmm1\n" // add gridcorrection
807 "addps %%xmm12, %%xmm8\n" // add gridcorrection
808 "addps %%xmm13, %%xmm9\n" // add gridcorrection
809 "movaps %%xmm0, (%2)\n" // Store
810 "movaps %%xmm1, 16(%2)\n" // Store
811 "movaps %%xmm8, 32(%2)\n" // Store
812 "movaps %%xmm9, 48(%2)\n" // Store
813 "sub $8, %0\n" // size -=8
814 "add $64, %2\n" // outcur+=64
815 "add $64, %3\n" // gridsample+=64
816 "cmp $0, %0\n"
817 "jg loop_wienerdegridnosharpen_sse3_big\n"
818 : /* no output registers */
819 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
820 : /* %0 %1 %2 %3 */
821 );
822 } else {
823 asm volatile
824 (
825 "movaps (%1), %%xmm14\n" // xmm14: 1e-15
826 "movaps 16(%1), %%xmm15\n" // Load gridfraction into xmm15
827 "movaps 32(%1), %%xmm13\n" //sigmaSquaredNoiseNormed in xmm13
828 "movaps 48(%1), %%xmm12\n" // xmm12 = lowlimit
829 "loop_wienerdegridnosharpen_sse3:\n"
830 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
831 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
832 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
833 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
834
835 "mulps %%xmm15, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
836 "mulps %%xmm15, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
837 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
838 "movaps %%xmm5, %%xmm3\n"
839 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
840 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
841 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
842 "movaps %%xmm1, %%xmm5\n"
843
844 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
845 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
846 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
847
848 "addps %%xmm14, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
849
850 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
851
852 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
853 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
854 "subps %%xmm13, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
855 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
856 "maxps %%xmm12, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
857
858 "movaps %%xmm6, %%xmm7\n"
859 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
860 "unpckhps %%xmm6, %%xmm6\n" // unpack high to xmm6
861
862 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
863 "mulps %%xmm6, %%xmm1\n" // re+im *= sfact
864 "addps %%xmm2, %%xmm0\n" // add gridcorrection
865 "addps %%xmm3, %%xmm1\n" // add gridcorrection
866 "movaps %%xmm0, (%2)\n" // Store
867 "movaps %%xmm1, 16(%2)\n" // Store
868 "sub $4, %0\n" // size -=4
869 "add $32, %2\n" // outcur+=32
870 "add $32, %3\n" // gridsample+=32
871 "cmp $0, %0\n"
872 "jg loop_wienerdegridnosharpen_sse3\n"
873 : /* no output registers */
874 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
875 : /* %0 %1 %2 %3 */
876 );
877 }
878 }
879
880 #else // 32 bits
processNoSharpen_SSE3(ComplexBlock * block)881 void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
882 {
883 fftwf_complex* outcur = block->complex;
884 fftwf_complex* gridsample = grid->complex;
885 float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
886 float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
887
888 for (int i = 0; i < 4; i++) {
889 temp[i+0] = 1e-15f; // 0
890 temp[i+4] = gridfraction; // 16
891 temp[i+8] = sigmaSquaredNoiseNormed; // 32
892 temp[i+12] = lowlimit; // 48
893 }
894 int size = bw*bh;
895 asm volatile
896 (
897 "loop_wienerdegridnosharpen_sse3:\n"
898 "movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
899 "movaps (%2), %%xmm0\n" // in r0i0 r1i1
900 "movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
901 "movaps (%3), %%xmm4\n" // grid r0i0 r1i1
902 "movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
903
904 "mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
905 "mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
906 "movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
907 "movaps %%xmm5, %%xmm3\n"
908 "subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
909 "subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
910 "movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
911 "movaps %%xmm1, %%xmm5\n"
912
913 "mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
914 "mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
915 "haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
916
917 "addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
918
919 //WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
920
921 "movaps 32(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
922 "movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
923 "rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
924 "subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
925 "movaps 48(%1), %%xmm5\n" // xmm5 = lowlimit
926 "mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
927 "maxps %%xmm6, %%xmm5\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
928
929 "movaps %%xmm5, %%xmm7\n"
930 "unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
931 "unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
932
933 "mulps %%xmm7, %%xmm0\n" // re+im *= sfact
934 "mulps %%xmm5, %%xmm1\n" // re+im *= sfact
935 "addps %%xmm2, %%xmm0\n" // add gridcorrection
936 "addps %%xmm3, %%xmm1\n" // add gridcorrection
937 "movaps %%xmm0, (%2)\n" // Store
938 "movaps %%xmm1, 16(%2)\n" // Store
939 "sub $4, %0\n" // size -=4
940 "add $32, %2\n" // outcur+=32
941 "add $32, %3\n" // gridsample+=32
942 "cmp $0, %0\n"
943 "jg loop_wienerdegridnosharpen_sse3\n"
944 : /* no output registers */
945 : "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
946 : /* %0 %1 %2 %3 */
947 );
948 }
949 #endif
950
951 #endif // defined (__i386__) || defined (__x86_64__)
952
953 }}// namespace RawStudio::FFTFilter
954