1 /*
2 * This file is part of RawTherapee.
3 *
4 * Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
5 *
6 * RawTherapee is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * RawTherapee is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with RawTherapee. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "librtprocess.h"
20 #include <cmath>
21 #include <cstring>
22 #include <cstdlib>
23 #include "opthelper.h"
24 #include "boxblur.h"
25
26 enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV};
27
28 using namespace librtprocess;
29
calculateYvVFactors(const T sigma,T & b1,T & b2,T & b3,T & B,T M[3][3])30 template<class T> void calculateYvVFactors( const T sigma, T &b1, T &b2, T &b3, T &B, T M[3][3])
31 {
32 // coefficient calculation
33 T q;
34
35 if (sigma < 2.5) {
36 q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
37 } else {
38 q = 0.98711 * sigma - 0.96330;
39 }
40
41 T b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
42 b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
43 b2 = -1.4281 * q * q - 1.26661 * q * q * q;
44 b3 = 0.422205 * q * q * q;
45 B = 1.0 - (b1 + b2 + b3) / b0;
46
47 b1 /= b0;
48 b2 /= b0;
49 b3 /= b0;
50
51 // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
52 M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
53 M[0][1] = (b3 + b1) * (b2 + b3 * b1);
54 M[0][2] = b3 * (b1 + b3 * b2);
55 M[1][0] = b1 + b3 * b2;
56 M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
57 M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
58 M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
59 M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
60 M[2][2] = b3 * (b1 + b3 * b2);
61
62 }
63
64 // classical filtering if the support window is small and src != dst
gauss3x3(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)65 template<class T> void gauss3x3 (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
66 {
67
68 // first row
69 #ifdef _OPENMP
70 #pragma omp single nowait
71 #endif
72 {
73 dst[0][0] = src[0][0];
74
75 for (int j = 1; j < W - 1; j++)
76 {
77 dst[0][j] = b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j];
78 }
79
80 dst[0][W - 1] = src[0][W - 1];
81 }
82
83 #ifdef _OPENMP
84 #pragma omp for nowait
85 #endif
86
87 for (int i = 1; i < H - 1; i++) {
88 dst[i][0] = b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0];
89
90 for (int j = 1; j < W - 1; j++) {
91 dst[i][j] = c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j];
92 }
93
94 dst[i][W - 1] = b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1];
95 }
96
97 // last row
98 #ifdef _OPENMP
99 #pragma omp single
100 #endif
101 {
102 dst[H - 1][0] = src[H - 1][0];
103
104 for (int j = 1; j < W - 1; j++) {
105 dst[H - 1][j] = b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j];
106 }
107
108 dst[H - 1][W - 1] = src[H - 1][W - 1];
109 }
110 }
111
112 // classical filtering if the support window is small and src != dst
gauss3x3mult(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)113 template<class T> void gauss3x3mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
114 {
115
116 // first row
117 #ifdef _OPENMP
118 #pragma omp single nowait
119 #endif
120 {
121 dst[0][0] *= src[0][0];
122
123 for (int j = 1; j < W - 1; j++)
124 {
125 dst[0][j] *= b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j];
126 }
127
128 dst[0][W - 1] *= src[0][W - 1];
129 }
130
131 #ifdef _OPENMP
132 #pragma omp for nowait
133 #endif
134
135 for (int i = 1; i < H - 1; i++) {
136 dst[i][0] *= b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0];
137
138 for (int j = 1; j < W - 1; j++) {
139 dst[i][j] *= c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j];
140 }
141
142 dst[i][W - 1] *= b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1];
143 }
144
145 // last row
146 #ifdef _OPENMP
147 #pragma omp single
148 #endif
149 {
150 dst[H - 1][0] *= src[H - 1][0];
151
152 for (int j = 1; j < W - 1; j++) {
153 dst[H - 1][j] *= b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j];
154 }
155
156 dst[H - 1][W - 1] *= src[H - 1][W - 1];
157 }
158 }
159
gauss3x3div(T ** RESTRICT src,T ** RESTRICT dst,T ** RESTRICT divBuffer,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)160 template<class T> void gauss3x3div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
161 {
162
163 // first row
164 #ifdef _OPENMP
165 #pragma omp single nowait
166 #endif
167 {
168 dst[0][0] = librtprocess::max(divBuffer[0][0] / (src[0][0] > 0.f ? src[0][0] : 1.f), 0.f);
169
170 for (int j = 1; j < W - 1; j++)
171 {
172 float tmp = (b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j]);
173 dst[0][j] = librtprocess::max(divBuffer[0][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
174 }
175
176 dst[0][W - 1] = librtprocess::max(divBuffer[0][W - 1] / (src[0][W - 1] > 0.f ? src[0][W - 1] : 1.f), 0.f);
177 }
178
179 #ifdef _OPENMP
180 #pragma omp for nowait
181 #endif
182
183 for (int i = 1; i < H - 1; i++) {
184 float tmp = (b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0]);
185 dst[i][0] = librtprocess::max(divBuffer[i][0] / (tmp > 0.f ? tmp : 1.f), 0.f);
186
187 for (int j = 1; j < W - 1; j++) {
188 tmp = (c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j]);
189 dst[i][j] = librtprocess::max(divBuffer[i][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
190 }
191
192 tmp = (b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1]);
193 dst[i][W - 1] = librtprocess::max(divBuffer[i][W - 1] / (tmp > 0.f ? tmp : 1.f), 0.f);
194 }
195
196 // last row
197 #ifdef _OPENMP
198 #pragma omp single
199 #endif
200 {
201 dst[H - 1][0] = librtprocess::max(divBuffer[H - 1][0] / (src[H - 1][0] > 0.f ? src[H - 1][0] : 1.f), 0.f);
202
203 for (int j = 1; j < W - 1; j++) {
204 float tmp = (b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j]);
205 dst[H - 1][j] = librtprocess::max(divBuffer[H - 1][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
206 }
207
208 dst[H - 1][W - 1] = librtprocess::max(divBuffer[H - 1][W - 1] / (src[H - 1][W - 1] > 0.f ? src[H - 1][W - 1] : 1.f), 0.f);
209 }
210 }
211
212
213 // use separated filter if the support window is small and src == dst
gaussHorizontal3(T ** src,T ** dst,int W,int H,const float c0,const float c1)214 template<class T> void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1)
215 {
216 T temp[W] ALIGNED16;
217 #ifdef _OPENMP
218 #pragma omp for
219 #endif
220
221 for (int i = 0; i < H; i++) {
222 for (int j = 1; j < W - 1; j++) {
223 temp[j] = (T)(c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]);
224 }
225
226 dst[i][0] = src[i][0];
227 memcpy (dst[i] + 1, temp + 1, (W - 2)*sizeof(T));
228
229 dst[i][W - 1] = src[i][W - 1];
230 }
231 }
232
233 #ifdef __SSE2__
gaussVertical3(T ** src,T ** dst,int W,int H,const float c0,const float c1)234 template<class T> void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1)
235 {
236 vfloat Tv = F2V(0.f), Tm1v, Tp1v;
237 vfloat Tv1 = F2V(0.f), Tm1v1, Tp1v1;
238 vfloat c0v, c1v;
239 c0v = F2V(c0);
240 c1v = F2V(c1);
241
242 #ifdef _OPENMP
243 #pragma omp for nowait
244 #endif
245
246 // process 8 columns per iteration for better usage of cpu cache
247 for (int i = 0; i < W - 7; i += 8) {
248 Tm1v = LVFU( src[0][i] );
249 Tm1v1 = LVFU( src[0][i + 4] );
250 STVFU( dst[0][i], Tm1v);
251 STVFU( dst[0][i + 4], Tm1v1);
252
253 if (H > 1) {
254 Tv = LVFU( src[1][i]);
255 Tv1 = LVFU( src[1][i + 4]);
256 }
257
258 for (int j = 1; j < H - 1; j++) {
259 Tp1v = LVFU( src[j + 1][i]);
260 Tp1v1 = LVFU( src[j + 1][i + 4]);
261 STVFU( dst[j][i], c1v * (Tp1v + Tm1v) + Tv * c0v);
262 STVFU( dst[j][i + 4], c1v * (Tp1v1 + Tm1v1) + Tv1 * c0v);
263 Tm1v = Tv;
264 Tm1v1 = Tv1;
265 Tv = Tp1v;
266 Tv1 = Tp1v1;
267 }
268
269 STVFU( dst[H - 1][i], LVFU( src[H - 1][i]));
270 STVFU( dst[H - 1][i + 4], LVFU( src[H - 1][i + 4]));
271 }
272
273 // Borders are done without SSE
274 float temp[H] ALIGNED16;
275 #ifdef _OPENMP
276 #pragma omp single
277 #endif
278
279 for (int i = W - (W % 8); i < W; i++) {
280 for (int j = 1; j < H - 1; j++) {
281 temp[j] = c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i];
282 }
283
284 dst[0][i] = src[0][i];
285
286 for (int j = 1; j < H - 1; j++) {
287 dst[j][i] = temp[j];
288 }
289
290 dst[H - 1][i] = src[H - 1][i];
291 }
292 }
293 #else
gaussVertical3(T ** src,T ** dst,int W,int H,const float c0,const float c1)294 template<class T> void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1)
295 {
296 T temp[H] ALIGNED16;
297 #ifdef _OPENMP
298 #pragma omp for
299 #endif
300
301 for (int i = 0; i < W; i++) {
302 for (int j = 1; j < H - 1; j++) {
303 temp[j] = (T)(c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]);
304 }
305
306 dst[0][i] = src[0][i];
307
308 for (int j = 1; j < H - 1; j++) {
309 dst[j][i] = temp[j];
310 }
311
312 dst[H - 1][i] = src[H - 1][i];
313 }
314 }
315 #endif
316
317 #ifdef __SSE2__
318 // fast gaussian approximation if the support window is large
gaussHorizontalSse(T ** src,T ** dst,const int W,const int H,const float sigma)319 template<class T> void gaussHorizontalSse (T** src, T** dst, const int W, const int H, const float sigma)
320 {
321 double b1, b2, b3, B, M[3][3];
322 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
323
324 for (int i = 0; i < 3; i++)
325 for (int j = 0; j < 3; j++) {
326 M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
327 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
328 }
329
330 vfloat Rv;
331 vfloat Tv, Tm2v, Tm3v;
332 vfloat Bv, b1v, b2v, b3v;
333 vfloat temp2W, temp2Wp1;
334 float tmp[W][4] ALIGNED16;
335 Bv = F2V(B);
336 b1v = F2V(b1);
337 b2v = F2V(b2);
338 b3v = F2V(b3);
339
340 #ifdef _OPENMP
341 #pragma omp for nowait
342 #endif
343
344 for (int i = 0; i < H - 3; i += 4) {
345 Tv = _mm_set_ps(src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0]);
346 Tm3v = Tv * (Bv + b1v + b2v + b3v);
347 STVF( tmp[0][0], Tm3v );
348
349 Tm2v = _mm_set_ps(src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1]) * Bv + Tm3v * b1v + Tv * (b2v + b3v);
350 STVF( tmp[1][0], Tm2v );
351
352 Rv = _mm_set_ps(src[i][2], src[i + 1][2], src[i + 2][2], src[i + 3][2]) * Bv + Tm2v * b1v + Tm3v * b2v + Tv * b3v;
353 STVF( tmp[2][0], Rv );
354
355 for (int j = 3; j < W; j++) {
356 Tv = Rv;
357 Rv = _mm_set_ps(src[i][j], src[i + 1][j], src[i + 2][j], src[i + 3][j]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
358 STVF( tmp[j][0], Rv );
359 Tm3v = Tm2v;
360 Tm2v = Tv;
361 }
362
363 Tv = _mm_set_ps(src[i][W - 1], src[i + 1][W - 1], src[i + 2][W - 1], src[i + 3][W - 1]);
364
365 temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * ( Tm2v - Tv ) + F2V(M[2][2]) * (Tm3v - Tv);
366 temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
367
368 Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
369 STVF(tmp[W - 1][0], Rv);
370
371 Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
372 STVF(tmp[W - 2][0], Tm2v);
373
374 Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
375 STVF(tmp[W - 3][0], Tm3v);
376
377 Tv = Rv;
378 Rv = Tm3v;
379 Tm3v = Tv;
380
381 for (int j = W - 4; j >= 0; j--) {
382 Tv = Rv;
383 Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
384 STVF(tmp[j][0], Rv);
385 Tm3v = Tm2v;
386 Tm2v = Tv;
387 }
388
389 for (int j = 0; j < W; j++) {
390 dst[i + 3][j] = tmp[j][0];
391 dst[i + 2][j] = tmp[j][1];
392 dst[i + 1][j] = tmp[j][2];
393 dst[i + 0][j] = tmp[j][3];
394 }
395
396
397 }
398
399 // Borders are done without SSE
400 #ifdef _OPENMP
401 #pragma omp single
402 #endif
403
404 for (int i = H - (H % 4); i < H; i++) {
405 tmp[0][0] = src[i][0] * (B + b1 + b2 + b3);
406 tmp[1][0] = B * src[i][1] + b1 * tmp[0][0] + src[i][0] * (b2 + b3);
407 tmp[2][0] = B * src[i][2] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[i][0];
408
409 for (int j = 3; j < W; j++) {
410 tmp[j][0] = B * src[i][j] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
411 }
412
413 float temp2Wm1 = src[i][W - 1] + M[0][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[0][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[0][2] * (tmp[W - 3][0] - src[i][W - 1]);
414 float temp2Ws = src[i][W - 1] + M[1][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[1][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[1][2] * (tmp[W - 3][0] - src[i][W - 1]);
415 float temp2Wp1s = src[i][W - 1] + M[2][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[2][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[2][2] * (tmp[W - 3][0] - src[i][W - 1]);
416
417 tmp[W - 1][0] = temp2Wm1;
418 tmp[W - 2][0] = B * tmp[W - 2][0] + b1 * tmp[W - 1][0] + b2 * temp2Ws + b3 * temp2Wp1s;
419 tmp[W - 3][0] = B * tmp[W - 3][0] + b1 * tmp[W - 2][0] + b2 * tmp[W - 1][0] + b3 * temp2Ws;
420
421 for (int j = W - 4; j >= 0; j--) {
422 tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
423 }
424
425 for (int j = 0; j < W; j++) {
426 dst[i][j] = tmp[j][0];
427 }
428 }
429 }
430 #endif
431
432 // fast gaussian approximation if the support window is large
gaussHorizontal(T ** src,T ** dst,const int W,const int H,const double sigma)433 template<class T> void gaussHorizontal (T** src, T** dst, const int W, const int H, const double sigma)
434 {
435 double b1, b2, b3, B, M[3][3];
436 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
437
438 for (int i = 0; i < 3; i++)
439 for (int j = 0; j < 3; j++) {
440 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
441 }
442
443 double temp2[W] ALIGNED16;
444
445 #ifdef _OPENMP
446 #pragma omp for
447 #endif
448
449 for (int i = 0; i < H; i++) {
450
451 temp2[0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0];
452 temp2[1] = B * src[i][1] + b1 * temp2[0] + b2 * src[i][0] + b3 * src[i][0];
453 temp2[2] = B * src[i][2] + b1 * temp2[1] + b2 * temp2[0] + b3 * src[i][0];
454
455 for (int j = 3; j < W; j++) {
456 temp2[j] = B * src[i][j] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3];
457 }
458
459 double temp2Wm1 = src[i][W - 1] + M[0][0] * (temp2[W - 1] - src[i][W - 1]) + M[0][1] * (temp2[W - 2] - src[i][W - 1]) + M[0][2] * (temp2[W - 3] - src[i][W - 1]);
460 double temp2W = src[i][W - 1] + M[1][0] * (temp2[W - 1] - src[i][W - 1]) + M[1][1] * (temp2[W - 2] - src[i][W - 1]) + M[1][2] * (temp2[W - 3] - src[i][W - 1]);
461 double temp2Wp1 = src[i][W - 1] + M[2][0] * (temp2[W - 1] - src[i][W - 1]) + M[2][1] * (temp2[W - 2] - src[i][W - 1]) + M[2][2] * (temp2[W - 3] - src[i][W - 1]);
462
463 temp2[W - 1] = temp2Wm1;
464 temp2[W - 2] = B * temp2[W - 2] + b1 * temp2[W - 1] + b2 * temp2W + b3 * temp2Wp1;
465 temp2[W - 3] = B * temp2[W - 3] + b1 * temp2[W - 2] + b2 * temp2[W - 1] + b3 * temp2W;
466
467 for (int j = W - 4; j >= 0; j--) {
468 temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3];
469 }
470
471 for (int j = 0; j < W; j++) {
472 dst[i][j] = (T)temp2[j];
473 }
474
475 }
476 }
477
478 #ifdef __SSE2__
gaussVerticalSse(T ** src,T ** dst,const int W,const int H,const float sigma)479 template<class T> void gaussVerticalSse (T** src, T** dst, const int W, const int H, const float sigma)
480 {
481 double b1, b2, b3, B, M[3][3];
482 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
483
484 for (int i = 0; i < 3; i++)
485 for (int j = 0; j < 3; j++) {
486 M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
487 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
488 }
489
490 float tmp[H][8] ALIGNED16;
491 vfloat Rv;
492 vfloat Tv, Tm2v, Tm3v;
493 vfloat Rv1;
494 vfloat Tv1, Tm2v1, Tm3v1;
495 vfloat Bv, b1v, b2v, b3v;
496 vfloat temp2W, temp2Wp1;
497 vfloat temp2W1, temp2Wp11;
498 Bv = F2V(B);
499 b1v = F2V(b1);
500 b2v = F2V(b2);
501 b3v = F2V(b3);
502
503 #ifdef _OPENMP
504 #pragma omp for nowait
505 #endif
506
507 // process 8 columns per iteration for better usage of cpu cache
508 for (int i = 0; i < W - 7; i += 8) {
509 Tv = LVFU( src[0][i]);
510 Tv1 = LVFU( src[0][i + 4]);
511 Rv = Tv * (Bv + b1v + b2v + b3v);
512 Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
513 Tm3v = Rv;
514 Tm3v1 = Rv1;
515 STVF( tmp[0][0], Rv );
516 STVF( tmp[0][4], Rv1 );
517
518 Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
519 Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
520 Tm2v = Rv;
521 Tm2v1 = Rv1;
522 STVF( tmp[1][0], Rv );
523 STVF( tmp[1][4], Rv1 );
524
525 Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
526 Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
527 STVF( tmp[2][0], Rv );
528 STVF( tmp[2][4], Rv1 );
529
530 for (int j = 3; j < H; j++) {
531 Tv = Rv;
532 Tv1 = Rv1;
533 Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
534 Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
535 STVF( tmp[j][0], Rv );
536 STVF( tmp[j][4], Rv1 );
537 Tm3v = Tm2v;
538 Tm3v1 = Tm2v1;
539 Tm2v = Tv;
540 Tm2v1 = Tv1;
541 }
542
543 Tv = LVFU(src[H - 1][i]);
544 Tv1 = LVFU(src[H - 1][i + 4]);
545
546 temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
547 temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
548 temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
549 temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
550
551 Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
552 Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
553 STVFU( dst[H - 1][i], Rv );
554 STVFU( dst[H - 1][i + 4], Rv1 );
555
556 Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
557 Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
558 STVFU( dst[H - 2][i], Tm2v );
559 STVFU( dst[H - 2][i + 4], Tm2v1 );
560
561 Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
562 Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
563 STVFU( dst[H - 3][i], Tm3v );
564 STVFU( dst[H - 3][i + 4], Tm3v1 );
565
566 Tv = Rv;
567 Tv1 = Rv1;
568 Rv = Tm3v;
569 Rv1 = Tm3v1;
570 Tm3v = Tv;
571 Tm3v1 = Tv1;
572
573 for (int j = H - 4; j >= 0; j--) {
574 Tv = Rv;
575 Tv1 = Rv1;
576 Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
577 Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
578 STVFU( dst[j][i], Rv );
579 STVFU( dst[j][i + 4], Rv1 );
580 Tm3v = Tm2v;
581 Tm3v1 = Tm2v1;
582 Tm2v = Tv;
583 Tm2v1 = Tv1;
584 }
585 }
586
587 // Borders are done without SSE
588 #ifdef _OPENMP
589 #pragma omp single
590 #endif
591
592 for (int i = W - (W % 8); i < W; i++) {
593 tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
594 tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
595 tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
596
597 for (int j = 3; j < H; j++) {
598 tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
599 }
600
601 float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
602 float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
603 float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
604
605 tmp[H - 1][0] = temp2Hm1;
606 tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
607 tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
608
609 for (int j = H - 4; j >= 0; j--) {
610 tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
611 }
612
613 for (int j = 0; j < H; j++) {
614 dst[j][i] = tmp[j][0];
615 }
616
617 }
618 }
619 #endif
620
621 #ifdef __SSE2__
gaussVerticalSsemult(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const float sigma)622 template<class T> void gaussVerticalSsemult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const float sigma)
623 {
624 double b1, b2, b3, B, M[3][3];
625 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
626
627 for (int i = 0; i < 3; i++)
628 for (int j = 0; j < 3; j++) {
629 M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
630 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
631 }
632
633 float tmp[H][8] ALIGNED16;
634 vfloat Rv;
635 vfloat Tv, Tm2v, Tm3v;
636 vfloat Rv1;
637 vfloat Tv1, Tm2v1, Tm3v1;
638 vfloat Bv, b1v, b2v, b3v;
639 vfloat temp2W, temp2Wp1;
640 vfloat temp2W1, temp2Wp11;
641 Bv = F2V(B);
642 b1v = F2V(b1);
643 b2v = F2V(b2);
644 b3v = F2V(b3);
645
646 #ifdef _OPENMP
647 #pragma omp for nowait
648 #endif
649
650 // process 8 columns per iteration for better usage of cpu cache
651 for (int i = 0; i < W - 7; i += 8) {
652 Tv = LVFU( src[0][i]);
653 Tv1 = LVFU( src[0][i + 4]);
654 Rv = Tv * (Bv + b1v + b2v + b3v);
655 Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
656 Tm3v = Rv;
657 Tm3v1 = Rv1;
658 STVF( tmp[0][0], Rv );
659 STVF( tmp[0][4], Rv1 );
660
661 Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
662 Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
663 Tm2v = Rv;
664 Tm2v1 = Rv1;
665 STVF( tmp[1][0], Rv );
666 STVF( tmp[1][4], Rv1 );
667
668 Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
669 Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
670 STVF( tmp[2][0], Rv );
671 STVF( tmp[2][4], Rv1 );
672
673 for (int j = 3; j < H; j++) {
674 Tv = Rv;
675 Tv1 = Rv1;
676 Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
677 Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
678 STVF( tmp[j][0], Rv );
679 STVF( tmp[j][4], Rv1 );
680 Tm3v = Tm2v;
681 Tm3v1 = Tm2v1;
682 Tm2v = Tv;
683 Tm2v1 = Tv1;
684 }
685
686 Tv = LVFU(src[H - 1][i]);
687 Tv1 = LVFU(src[H - 1][i + 4]);
688
689 temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
690 temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
691 temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
692 temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
693
694 Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
695 Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
696 STVFU( dst[H - 1][i], LVFU(dst[H - 1][i]) * Rv );
697 STVFU( dst[H - 1][i + 4], LVFU(dst[H - 1][i + 4]) * Rv1 );
698
699 Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
700 Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
701 STVFU( dst[H - 2][i], LVFU(dst[H - 2][i]) * Tm2v );
702 STVFU( dst[H - 2][i + 4], LVFU(dst[H - 2][i + 4]) * Tm2v1 );
703
704 Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
705 Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
706 STVFU( dst[H - 3][i], LVFU(dst[H - 3][i]) * Tm3v );
707 STVFU( dst[H - 3][i + 4], LVFU(dst[H - 3][i + 4]) * Tm3v1 );
708
709 Tv = Rv;
710 Tv1 = Rv1;
711 Rv = Tm3v;
712 Rv1 = Tm3v1;
713 Tm3v = Tv;
714 Tm3v1 = Tv1;
715
716 for (int j = H - 4; j >= 0; j--) {
717 Tv = Rv;
718 Tv1 = Rv1;
719 Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
720 Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
721 STVFU( dst[j][i], LVFU(dst[j][i]) * Rv );
722 STVFU( dst[j][i + 4], LVFU(dst[j][i + 4]) * Rv1 );
723 Tm3v = Tm2v;
724 Tm3v1 = Tm2v1;
725 Tm2v = Tv;
726 Tm2v1 = Tv1;
727 }
728 }
729
730 // Borders are done without SSE
731 #ifdef _OPENMP
732 #pragma omp single
733 #endif
734
735 for (int i = W - (W % 8); i < W; i++) {
736 tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
737 tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
738 tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
739
740 for (int j = 3; j < H; j++) {
741 tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
742 }
743
744 float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
745 float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
746 float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
747
748 tmp[H - 1][0] = temp2Hm1;
749 tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
750 tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
751
752 for (int j = H - 4; j >= 0; j--) {
753 tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
754 }
755
756 for (int j = 0; j < H; j++) {
757 dst[j][i] *= tmp[j][0];
758 }
759
760 }
761 }
762
gaussVerticalSsediv(T ** src,T ** dst,T ** divBuffer,const int W,const int H,const float sigma)763 template<class T> void gaussVerticalSsediv (T** src, T** dst, T** divBuffer, const int W, const int H, const float sigma)
764 {
765 double b1, b2, b3, B, M[3][3];
766 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
767
768 for (int i = 0; i < 3; i++)
769 for (int j = 0; j < 3; j++) {
770 M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
771 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
772 }
773
774 float tmp[H][8] ALIGNED16;
775 vfloat Rv;
776 vfloat Tv, Tm2v, Tm3v;
777 vfloat Rv1;
778 vfloat Tv1, Tm2v1, Tm3v1;
779 vfloat Bv, b1v, b2v, b3v;
780 vfloat temp2W, temp2Wp1;
781 vfloat temp2W1, temp2Wp11;
782 vfloat onev = F2V(1.f);
783 Bv = F2V(B);
784 b1v = F2V(b1);
785 b2v = F2V(b2);
786 b3v = F2V(b3);
787
788 #ifdef _OPENMP
789 #pragma omp for nowait
790 #endif
791
792 // process 8 columns per iteration for better usage of cpu cache
793 for (int i = 0; i < W - 7; i += 8) {
794 Tv = LVFU( src[0][i]);
795 Tv1 = LVFU( src[0][i + 4]);
796 Rv = Tv * (Bv + b1v + b2v + b3v);
797 Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
798 Tm3v = Rv;
799 Tm3v1 = Rv1;
800 STVF( tmp[0][0], Rv );
801 STVF( tmp[0][4], Rv1 );
802
803 Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
804 Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
805 Tm2v = Rv;
806 Tm2v1 = Rv1;
807 STVF( tmp[1][0], Rv );
808 STVF( tmp[1][4], Rv1 );
809
810 Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
811 Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
812 STVF( tmp[2][0], Rv );
813 STVF( tmp[2][4], Rv1 );
814
815 for (int j = 3; j < H; j++) {
816 Tv = Rv;
817 Tv1 = Rv1;
818 Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
819 Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
820 STVF( tmp[j][0], Rv );
821 STVF( tmp[j][4], Rv1 );
822 Tm3v = Tm2v;
823 Tm3v1 = Tm2v1;
824 Tm2v = Tv;
825 Tm2v1 = Tv1;
826 }
827
828 Tv = LVFU(src[H - 1][i]);
829 Tv1 = LVFU(src[H - 1][i + 4]);
830
831 temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
832 temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
833 temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
834 temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
835
836 Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
837 Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
838
839 STVFU( dst[H - 1][i], LVFU(divBuffer[H - 1][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev));
840 STVFU( dst[H - 1][i + 4], LVFU(divBuffer[H - 1][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev));
841
842 Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
843 Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
844 STVFU( dst[H - 2][i], LVFU(divBuffer[H - 2][i]) / vself(vmaskf_gt(Tm2v, ZEROV), Tm2v, onev));
845 STVFU( dst[H - 2][i + 4], LVFU(divBuffer[H - 2][i + 4]) / vself(vmaskf_gt(Tm2v1, ZEROV), Tm2v1, onev));
846
847 Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
848 Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
849 STVFU( dst[H - 3][i], LVFU(divBuffer[H - 3][i]) / vself(vmaskf_gt(Tm3v, ZEROV), Tm3v, onev));
850 STVFU( dst[H - 3][i + 4], LVFU(divBuffer[H - 3][i + 4]) / vself(vmaskf_gt(Tm3v1, ZEROV), Tm3v1, onev));
851
852 Tv = Rv;
853 Tv1 = Rv1;
854 Rv = Tm3v;
855 Rv1 = Tm3v1;
856 Tm3v = Tv;
857 Tm3v1 = Tv1;
858
859 for (int j = H - 4; j >= 0; j--) {
860 Tv = Rv;
861 Tv1 = Rv1;
862 Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
863 Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
864 STVFU( dst[j][i], vmaxf(LVFU(divBuffer[j][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev), ZEROV));
865 STVFU( dst[j][i + 4], vmaxf(LVFU(divBuffer[j][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev), ZEROV));
866 Tm3v = Tm2v;
867 Tm3v1 = Tm2v1;
868 Tm2v = Tv;
869 Tm2v1 = Tv1;
870 }
871 }
872
873 // Borders are done without SSE
874 #ifdef _OPENMP
875 #pragma omp single
876 #endif
877
878 for (int i = W - (W % 8); i < W; i++) {
879 tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
880 tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
881 tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
882
883 for (int j = 3; j < H; j++) {
884 tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
885 }
886
887 float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
888 float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
889 float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
890
891 tmp[H - 1][0] = temp2Hm1;
892 tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
893 tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
894
895 for (int j = H - 4; j >= 0; j--) {
896 tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
897 }
898
899 for (int j = 0; j < H; j++) {
900 dst[j][i] = librtprocess::max(divBuffer[j][i] / (tmp[j][0] > 0.f ? tmp[j][0] : 1.f), 0.f);
901 }
902
903 }
904 }
905
906 #endif
907
gaussVertical(T ** src,T ** dst,const int W,const int H,const double sigma)908 template<class T> void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma)
909 {
910 double b1, b2, b3, B, M[3][3];
911 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
912
913 for (int i = 0; i < 3; i++)
914 for (int j = 0; j < 3; j++) {
915 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
916 }
917
918 // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
919 static const int numcols = 8;
920 double temp2[H][numcols] ALIGNED16;
921 double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
922 #ifdef _OPENMP
923 #pragma omp for nowait
924 #endif
925
926 for (unsigned int i = 0; i < static_cast<unsigned>(std::max(0, W - numcols + 1)); i += numcols) {
927 for (int k = 0; k < numcols; k++) {
928 temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
929 temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
930 temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
931 }
932
933 for (int j = 3; j < H; j++) {
934 for (int k = 0; k < numcols; k++) {
935 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
936 }
937 }
938
939 for (int k = 0; k < numcols; k++) {
940 temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
941 temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
942 temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
943 }
944
945 for (int k = 0; k < numcols; k++) {
946 dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k];
947 dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
948 dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
949 }
950
951 for (int j = H - 4; j >= 0; j--) {
952 for (int k = 0; k < numcols; k++) {
953 dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k];
954 }
955 }
956 }
957
958 #ifdef _OPENMP
959 #pragma omp single
960 #endif
961
962 // process remaining columns
963 for (int i = W - (W % numcols); i < W; i++) {
964 temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
965 temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
966 temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
967
968 for (int j = 3; j < H; j++) {
969 temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
970 }
971
972 double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
973 double temp2Hs = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
974 double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
975
976 dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1s;
977 dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s;
978 dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs;
979
980 for (int j = H - 4; j >= 0; j--) {
981 dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0];
982 }
983 }
984 }
985
986 #ifndef __SSE2__
gaussVerticaldiv(T ** src,T ** dst,T ** divBuffer,const int W,const int H,const double sigma)987 template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma)
988 {
989 double b1, b2, b3, B, M[3][3];
990 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
991
992 for (int i = 0; i < 3; i++)
993 for (int j = 0; j < 3; j++) {
994 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
995 }
996
997 // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
998 static const int numcols = 8;
999 double temp2[H][numcols] ALIGNED16;
1000 double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
1001 #ifdef _OPENMP
1002 #pragma omp for nowait
1003 #endif
1004
1005 for (int i = 0; i < W - numcols + 1; i += numcols) {
1006 for (int k = 0; k < numcols; k++) {
1007 temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1008 temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1009 temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
1010 }
1011
1012 for (int j = 3; j < H; j++) {
1013 for (int k = 0; k < numcols; k++) {
1014 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
1015 }
1016 }
1017
1018 for (int k = 0; k < numcols; k++) {
1019 temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1020 temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1021 temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1022 }
1023
1024 for (int k = 0; k < numcols; k++) {
1025 dst[H - 1][i + k] = librtprocess::max(divBuffer[H - 1][i + k] / (temp2[H - 1][k] = temp2Hm1[k]), 0.0);
1026 dst[H - 2][i + k] = librtprocess::max(divBuffer[H - 2][i + k] / (temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0);
1027 dst[H - 3][i + k] = librtprocess::max(divBuffer[H - 3][i + k] / (temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]), 0.0);
1028 }
1029
1030 for (int j = H - 4; j >= 0; j--) {
1031 for (int k = 0; k < numcols; k++) {
1032 dst[j][i + k] = librtprocess::max(divBuffer[j][i + k] / (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]), 0.0);
1033 }
1034 }
1035 }
1036
1037 #ifdef _OPENMP
1038 #pragma omp single
1039 #endif
1040
1041 // process remaining columns
1042 for (int i = W - (W % numcols); i < W; i++) {
1043 temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
1044 temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
1045 temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
1046
1047 for (int j = 3; j < H; j++) {
1048 temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
1049 }
1050
1051 double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
1052 double temp2Hs = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
1053 double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
1054
1055 dst[H - 1][i] = librtprocess::max(divBuffer[H - 1][i] / (temp2[H - 1][0] = temp2Hm1s), 0.0);
1056 dst[H - 2][i] = librtprocess::max(divBuffer[H - 2][i] / (temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s), 0.0);
1057 dst[H - 3][i] = librtprocess::max(divBuffer[H - 3][i] / (temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs), 0.0);
1058
1059 for (int j = H - 4; j >= 0; j--) {
1060 dst[j][i] = librtprocess::max(divBuffer[j][i] / (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]), 0.0);
1061 }
1062 }
1063 }
1064
gaussVerticalmult(T ** src,T ** dst,const int W,const int H,const double sigma)1065 template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma)
1066 {
1067 double b1, b2, b3, B, M[3][3];
1068 calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
1069
1070 for (int i = 0; i < 3; i++)
1071 for (int j = 0; j < 3; j++) {
1072 M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
1073 }
1074
1075 // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
1076 static const int numcols = 8;
1077 double temp2[H][numcols] ALIGNED16;
1078 double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
1079 #ifdef _OPENMP
1080 #pragma omp for nowait
1081 #endif
1082
1083 for (int i = 0; i < W - numcols + 1; i += numcols) {
1084 for (int k = 0; k < numcols; k++) {
1085 temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1086 temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1087 temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
1088 }
1089
1090 for (int j = 3; j < H; j++) {
1091 for (int k = 0; k < numcols; k++) {
1092 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
1093 }
1094 }
1095
1096 for (int k = 0; k < numcols; k++) {
1097 temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1098 temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1099 temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1100 }
1101
1102 for (int k = 0; k < numcols; k++) {
1103 dst[H - 1][i + k] *= temp2[H - 1][k] = temp2Hm1[k];
1104 dst[H - 2][i + k] *= temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
1105 dst[H - 3][i + k] *= temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
1106 }
1107
1108 for (int j = H - 4; j >= 0; j--) {
1109 for (int k = 0; k < numcols; k++) {
1110 dst[j][i + k] *= (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]);
1111 }
1112 }
1113 }
1114
1115 #ifdef _OPENMP
1116 #pragma omp single
1117 #endif
1118
1119 // process remaining columns
1120 for (int i = W - (W % numcols); i < W; i++) {
1121 temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
1122 temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
1123 temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
1124
1125 for (int j = 3; j < H; j++) {
1126 temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
1127 }
1128
1129 double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
1130 double temp2Hs = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
1131 double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
1132
1133 dst[H - 1][i] *= temp2[H - 1][0] = temp2Hm1s;
1134 dst[H - 2][i] *= temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s;
1135 dst[H - 3][i] *= temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs;
1136
1137 for (int j = H - 4; j >= 0; j--) {
1138 dst[j][i] *= (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]);
1139 }
1140 }
1141 }
1142 #endif
1143
1144 template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr)
1145 {
1146 static constexpr auto GAUSS_SKIP = 0.25;
1147 static constexpr auto GAUSS_3X3_LIMIT = 0.6;
1148 static constexpr auto GAUSS_DOUBLE = 25.0;
1149
1150 if(buffer) {
1151 // special variant for very large sigma, currently only used by retinex algorithm
1152 // use iterated boxblur to approximate gaussian blur
1153 // Compute ideal averaging filter width and number of iterations
1154 int n = 1;
1155 double wIdeal = sqrt((12 * sigma * sigma) + 1);
1156
1157 while(wIdeal > W || wIdeal > H) {
1158 n++;
1159 wIdeal = sqrt((12 * sigma * sigma / n) + 1);
1160 }
1161
1162 if(n < 3) {
1163 n = 3;
1164 wIdeal = sqrt((12 * sigma * sigma / n) + 1);
1165 } else if(n > 6) {
1166 n = 6;
1167 }
1168
1169 int wl = wIdeal;
1170
1171 if(wl % 2 == 0) {
1172 wl--;
1173 }
1174
1175 int wu = wl + 2;
1176
1177 double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
1178 int m = round(mIdeal);
1179
1180 int sizes[n];
1181
1182 for(int i = 0; i < n; i++) {
1183 sizes[i] = ((i < m ? wl : wu) - 1) / 2;
1184 }
1185
1186 librtprocess::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
1187
1188 for(int i = 1; i < n; i++) {
1189 librtprocess::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
1190 }
1191 } else {
1192 if (sigma < GAUSS_SKIP) {
1193 // don't perform filtering
1194 if (src != dst) {
1195 for(int i = 0; i < H; ++i) {
1196 memcpy(dst[i], src[i], W * sizeof(T));
1197 }
1198 }
1199 } else if (sigma < GAUSS_3X3_LIMIT) {
1200 if(src != dst) {
1201 // If src != dst we can take the fast way
1202 // compute 3x3 kernel values
1203 double c0 = 1.0;
1204 double c1 = exp( -0.5 * (librtprocess::SQR(1.0 / sigma)) );
1205 double c2 = exp( -librtprocess::SQR(1.0 / sigma) );
1206
1207 // normalize kernel values
1208 double sum = c0 + 4.0 * (c1 + c2);
1209 c0 /= sum;
1210 c1 /= sum;
1211 c2 /= sum;
1212 // compute kernel values for border pixels
1213 double b1 = exp (-1.0 / (2.0 * sigma * sigma));
1214 double bsum = 2.0 * b1 + 1.0;
1215 b1 /= bsum;
1216 double b0 = 1.0 / bsum;
1217
1218 switch (gausstype) {
1219 case GAUSS_MULT :
1220 gauss3x3mult<T> (src, dst, W, H, c0, c1, c2, b0, b1);
1221 break;
1222
1223 case GAUSS_DIV :
1224 gauss3x3div<T> (src, dst, buffer2, W, H, c0, c1, c2, b0, b1);
1225 break;
1226
1227 case GAUSS_STANDARD :
1228 gauss3x3<T> (src, dst, W, H, c0, c1, c2, b0, b1);
1229 break;
1230 }
1231 } else {
1232 // compute kernel values for separated 3x3 gaussian blur
1233 double c1 = exp (-1.0 / (2.0 * sigma * sigma));
1234 double csum = 2.0 * c1 + 1.0;
1235 c1 /= csum;
1236 double c0 = 1.0 / csum;
1237 gaussHorizontal3<T> (src, dst, W, H, c0, c1);
1238 gaussVertical3<T> (dst, dst, W, H, c0, c1);
1239 }
1240 } else {
1241 #ifdef __SSE2__
1242
1243 if (sigma < GAUSS_DOUBLE) {
1244 switch (gausstype) {
1245 case GAUSS_MULT : {
1246 gaussHorizontalSse<T> (src, src, W, H, sigma);
1247 gaussVerticalSsemult<T> (src, dst, W, H, sigma);
1248 break;
1249 }
1250
1251 case GAUSS_DIV : {
1252 gaussHorizontalSse<T> (src, dst, W, H, sigma);
1253 gaussVerticalSsediv<T> (dst, dst, buffer2, W, H, sigma);
1254 break;
1255 }
1256
1257 case GAUSS_STANDARD : {
1258 gaussHorizontalSse<T> (src, dst, W, H, sigma);
1259 gaussVerticalSse<T> (dst, dst, W, H, sigma);
1260 break;
1261 }
1262 }
1263 } else { // large sigma only with double precision
1264 gaussHorizontal<T> (src, dst, W, H, sigma);
1265 gaussVertical<T> (dst, dst, W, H, sigma);
1266 }
1267
1268 #else
1269
1270 if (sigma < GAUSS_DOUBLE) {
1271 switch (gausstype) {
1272 case GAUSS_MULT : {
1273 gaussHorizontal<T> (src, src, W, H, sigma);
1274 gaussVerticalmult<T> (src, dst, W, H, sigma);
1275 break;
1276 }
1277
1278 case GAUSS_DIV : {
1279 gaussHorizontal<T> (src, dst, W, H, sigma);
1280 gaussVerticaldiv<T> (dst, dst, buffer2, W, H, sigma);
1281 break;
1282 }
1283
1284 case GAUSS_STANDARD : {
1285 gaussHorizontal<T> (src, dst, W, H, sigma);
1286 gaussVertical<T> (dst, dst, W, H, sigma);
1287 break;
1288 }
1289 }
1290 } else { // large sigma only with double precision
1291 gaussHorizontal<T> (src, dst, W, H, sigma);
1292 gaussVertical<T> (dst, dst, W, H, sigma);
1293 }
1294
1295 #endif
1296 }
1297 }
1298 }
1299
1300 static void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr)
1301 {
1302 gaussianBlurImpl<float>(src, dst, W, H, sigma, buffer, gausstype, buffer2);
1303 }
1304
1305
1306