1 /*
2  *  This file is part of RawTherapee.
3  *
4  *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
5  *
6  *  RawTherapee is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  RawTherapee is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 #include "librtprocess.h"
20 #include <cmath>
21 #include <cstring>
22 #include <cstdlib>
23 #include "opthelper.h"
24 #include "boxblur.h"
25 
26 enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV};
27 
28 using namespace librtprocess;
29 
calculateYvVFactors(const T sigma,T & b1,T & b2,T & b3,T & B,T M[3][3])30 template<class T> void calculateYvVFactors( const T sigma, T &b1, T &b2, T &b3, T &B, T M[3][3])
31 {
32     // coefficient calculation
33     T q;
34 
35     if (sigma < 2.5) {
36         q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
37     } else {
38         q = 0.98711 * sigma - 0.96330;
39     }
40 
41     T b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
42     b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
43     b2 = -1.4281 * q * q - 1.26661 * q * q * q;
44     b3 = 0.422205 * q * q * q;
45     B = 1.0 - (b1 + b2 + b3) / b0;
46 
47     b1 /= b0;
48     b2 /= b0;
49     b3 /= b0;
50 
51     // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
52     M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
53     M[0][1] = (b3 + b1) * (b2 + b3 * b1);
54     M[0][2] = b3 * (b1 + b3 * b2);
55     M[1][0] = b1 + b3 * b2;
56     M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
57     M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
58     M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
59     M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
60     M[2][2] = b3 * (b1 + b3 * b2);
61 
62 }
63 
64 // classical filtering if the support window is small and src != dst
gauss3x3(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)65 template<class T> void gauss3x3 (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
66 {
67 
68     // first row
69 #ifdef _OPENMP
70     #pragma omp single nowait
71 #endif
72     {
73         dst[0][0] = src[0][0];
74 
75         for (int j = 1; j < W - 1; j++)
76         {
77             dst[0][j] = b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j];
78         }
79 
80         dst[0][W - 1] = src[0][W - 1];
81     }
82 
83 #ifdef _OPENMP
84     #pragma omp for nowait
85 #endif
86 
87     for (int i = 1; i < H - 1; i++) {
88         dst[i][0] = b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0];
89 
90         for (int j = 1; j < W - 1; j++) {
91             dst[i][j] = c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j];
92         }
93 
94         dst[i][W - 1] = b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1];
95     }
96 
97     // last row
98 #ifdef _OPENMP
99     #pragma omp single
100 #endif
101     {
102         dst[H - 1][0] = src[H - 1][0];
103 
104         for (int j = 1; j < W - 1; j++) {
105             dst[H - 1][j] = b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j];
106         }
107 
108         dst[H - 1][W - 1] = src[H - 1][W - 1];
109     }
110 }
111 
112 // classical filtering if the support window is small and src != dst
gauss3x3mult(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)113 template<class T> void gauss3x3mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
114 {
115 
116     // first row
117 #ifdef _OPENMP
118     #pragma omp single nowait
119 #endif
120     {
121         dst[0][0] *= src[0][0];
122 
123         for (int j = 1; j < W - 1; j++)
124         {
125             dst[0][j] *= b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j];
126         }
127 
128         dst[0][W - 1] *= src[0][W - 1];
129     }
130 
131 #ifdef _OPENMP
132     #pragma omp for nowait
133 #endif
134 
135     for (int i = 1; i < H - 1; i++) {
136         dst[i][0] *= b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0];
137 
138         for (int j = 1; j < W - 1; j++) {
139             dst[i][j] *= c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j];
140         }
141 
142         dst[i][W - 1] *= b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1];
143     }
144 
145     // last row
146 #ifdef _OPENMP
147     #pragma omp single
148 #endif
149     {
150         dst[H - 1][0] *= src[H - 1][0];
151 
152         for (int j = 1; j < W - 1; j++) {
153             dst[H - 1][j] *= b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j];
154         }
155 
156         dst[H - 1][W - 1] *= src[H - 1][W - 1];
157     }
158 }
159 
gauss3x3div(T ** RESTRICT src,T ** RESTRICT dst,T ** RESTRICT divBuffer,const int W,const int H,const T c0,const T c1,const T c2,const T b0,const T b1)160 template<class T> void gauss3x3div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1)
161 {
162 
163     // first row
164 #ifdef _OPENMP
165     #pragma omp single nowait
166 #endif
167     {
168         dst[0][0] = librtprocess::max(divBuffer[0][0] / (src[0][0] > 0.f ? src[0][0] : 1.f), 0.f);
169 
170         for (int j = 1; j < W - 1; j++)
171         {
172             float tmp = (b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j]);
173             dst[0][j] = librtprocess::max(divBuffer[0][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
174         }
175 
176         dst[0][W - 1] = librtprocess::max(divBuffer[0][W - 1] / (src[0][W - 1] > 0.f ? src[0][W - 1] : 1.f), 0.f);
177     }
178 
179 #ifdef _OPENMP
180     #pragma omp for nowait
181 #endif
182 
183     for (int i = 1; i < H - 1; i++) {
184         float tmp = (b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0]);
185         dst[i][0] = librtprocess::max(divBuffer[i][0] / (tmp > 0.f ? tmp : 1.f), 0.f);
186 
187         for (int j = 1; j < W - 1; j++) {
188             tmp = (c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j]);
189             dst[i][j] = librtprocess::max(divBuffer[i][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
190         }
191 
192         tmp = (b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1]);
193         dst[i][W - 1] = librtprocess::max(divBuffer[i][W - 1] / (tmp > 0.f ? tmp : 1.f), 0.f);
194     }
195 
196     // last row
197 #ifdef _OPENMP
198     #pragma omp single
199 #endif
200     {
201         dst[H - 1][0] = librtprocess::max(divBuffer[H - 1][0] / (src[H - 1][0] > 0.f ? src[H - 1][0] : 1.f), 0.f);
202 
203         for (int j = 1; j < W - 1; j++) {
204             float tmp = (b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j]);
205             dst[H - 1][j] = librtprocess::max(divBuffer[H - 1][j] / (tmp > 0.f ? tmp : 1.f), 0.f);
206         }
207 
208         dst[H - 1][W - 1] = librtprocess::max(divBuffer[H - 1][W - 1] / (src[H - 1][W - 1] > 0.f ? src[H - 1][W - 1] : 1.f), 0.f);
209     }
210 }
211 
212 
213 // use separated filter if the support window is small and src == dst
gaussHorizontal3(T ** src,T ** dst,int W,int H,const float c0,const float c1)214 template<class T> void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1)
215 {
216     T temp[W] ALIGNED16;
217 #ifdef _OPENMP
218     #pragma omp for
219 #endif
220 
221     for (int i = 0; i < H; i++) {
222         for (int j = 1; j < W - 1; j++) {
223             temp[j] = (T)(c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]);
224         }
225 
226         dst[i][0] = src[i][0];
227         memcpy (dst[i] + 1, temp + 1, (W - 2)*sizeof(T));
228 
229         dst[i][W - 1] = src[i][W - 1];
230     }
231 }
232 
233 #ifdef __SSE2__
gaussVertical3(T ** src,T ** dst,int W,int H,const float c0,const float c1)234 template<class T> void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1)
235 {
236     vfloat Tv = F2V(0.f), Tm1v, Tp1v;
237     vfloat Tv1 = F2V(0.f), Tm1v1, Tp1v1;
238     vfloat c0v, c1v;
239     c0v = F2V(c0);
240     c1v = F2V(c1);
241 
242 #ifdef _OPENMP
243     #pragma omp for nowait
244 #endif
245 
246     // process 8 columns per iteration for better usage of cpu cache
247     for (int i = 0; i < W - 7; i += 8) {
248         Tm1v = LVFU( src[0][i] );
249         Tm1v1 = LVFU( src[0][i + 4] );
250         STVFU( dst[0][i], Tm1v);
251         STVFU( dst[0][i + 4], Tm1v1);
252 
253         if (H > 1) {
254             Tv = LVFU( src[1][i]);
255             Tv1 = LVFU( src[1][i + 4]);
256         }
257 
258         for (int j = 1; j < H - 1; j++) {
259             Tp1v = LVFU( src[j + 1][i]);
260             Tp1v1 = LVFU( src[j + 1][i + 4]);
261             STVFU( dst[j][i], c1v * (Tp1v + Tm1v) + Tv * c0v);
262             STVFU( dst[j][i + 4], c1v * (Tp1v1 + Tm1v1) + Tv1 * c0v);
263             Tm1v = Tv;
264             Tm1v1 = Tv1;
265             Tv = Tp1v;
266             Tv1 = Tp1v1;
267         }
268 
269         STVFU( dst[H - 1][i], LVFU( src[H - 1][i]));
270         STVFU( dst[H - 1][i + 4], LVFU( src[H - 1][i + 4]));
271     }
272 
273 // Borders are done without SSE
274     float temp[H] ALIGNED16;
275 #ifdef _OPENMP
276     #pragma omp single
277 #endif
278 
279     for (int i = W - (W % 8); i < W; i++) {
280         for (int j = 1; j < H - 1; j++) {
281             temp[j] = c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i];
282         }
283 
284         dst[0][i] = src[0][i];
285 
286         for (int j = 1; j < H - 1; j++) {
287             dst[j][i] = temp[j];
288         }
289 
290         dst[H - 1][i] = src[H - 1][i];
291     }
292 }
293 #else
gaussVertical3(T ** src,T ** dst,int W,int H,const float c0,const float c1)294 template<class T> void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1)
295 {
296     T temp[H] ALIGNED16;
297 #ifdef _OPENMP
298     #pragma omp for
299 #endif
300 
301     for (int i = 0; i < W; i++) {
302         for (int j = 1; j < H - 1; j++) {
303             temp[j] = (T)(c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]);
304         }
305 
306         dst[0][i] = src[0][i];
307 
308         for (int j = 1; j < H - 1; j++) {
309             dst[j][i] = temp[j];
310         }
311 
312         dst[H - 1][i] = src[H - 1][i];
313     }
314 }
315 #endif
316 
317 #ifdef __SSE2__
318 // fast gaussian approximation if the support window is large
gaussHorizontalSse(T ** src,T ** dst,const int W,const int H,const float sigma)319 template<class T> void gaussHorizontalSse (T** src, T** dst, const int W, const int H, const float sigma)
320 {
321     double b1, b2, b3, B, M[3][3];
322     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
323 
324     for (int i = 0; i < 3; i++)
325         for (int j = 0; j < 3; j++) {
326             M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
327             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
328         }
329 
330     vfloat Rv;
331     vfloat Tv, Tm2v, Tm3v;
332     vfloat Bv, b1v, b2v, b3v;
333     vfloat temp2W, temp2Wp1;
334     float tmp[W][4] ALIGNED16;
335     Bv = F2V(B);
336     b1v = F2V(b1);
337     b2v = F2V(b2);
338     b3v = F2V(b3);
339 
340 #ifdef _OPENMP
341     #pragma omp for nowait
342 #endif
343 
344     for (int i = 0; i < H - 3; i += 4) {
345         Tv = _mm_set_ps(src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0]);
346         Tm3v = Tv * (Bv + b1v + b2v + b3v);
347         STVF( tmp[0][0], Tm3v );
348 
349         Tm2v = _mm_set_ps(src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1]) * Bv + Tm3v * b1v + Tv * (b2v + b3v);
350         STVF( tmp[1][0], Tm2v );
351 
352         Rv = _mm_set_ps(src[i][2], src[i + 1][2], src[i + 2][2], src[i + 3][2]) * Bv + Tm2v * b1v + Tm3v * b2v + Tv * b3v;
353         STVF( tmp[2][0], Rv );
354 
355         for (int j = 3; j < W; j++) {
356             Tv = Rv;
357             Rv = _mm_set_ps(src[i][j], src[i + 1][j], src[i + 2][j], src[i + 3][j]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
358             STVF( tmp[j][0], Rv );
359             Tm3v = Tm2v;
360             Tm2v = Tv;
361         }
362 
363         Tv = _mm_set_ps(src[i][W - 1], src[i + 1][W - 1], src[i + 2][W - 1], src[i + 3][W - 1]);
364 
365         temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * ( Tm2v - Tv ) +  F2V(M[2][2]) * (Tm3v - Tv);
366         temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
367 
368         Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
369         STVF(tmp[W - 1][0], Rv);
370 
371         Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
372         STVF(tmp[W - 2][0], Tm2v);
373 
374         Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
375         STVF(tmp[W - 3][0], Tm3v);
376 
377         Tv = Rv;
378         Rv = Tm3v;
379         Tm3v = Tv;
380 
381         for (int j = W - 4; j >= 0; j--) {
382             Tv = Rv;
383             Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
384             STVF(tmp[j][0], Rv);
385             Tm3v = Tm2v;
386             Tm2v = Tv;
387         }
388 
389         for (int j = 0; j < W; j++) {
390             dst[i + 3][j] = tmp[j][0];
391             dst[i + 2][j] = tmp[j][1];
392             dst[i + 1][j] = tmp[j][2];
393             dst[i + 0][j] = tmp[j][3];
394         }
395 
396 
397     }
398 
399 // Borders are done without SSE
400 #ifdef _OPENMP
401     #pragma omp single
402 #endif
403 
404     for (int i = H - (H % 4); i < H; i++) {
405         tmp[0][0] = src[i][0] * (B + b1 + b2 + b3);
406         tmp[1][0] = B * src[i][1] + b1 * tmp[0][0]  + src[i][0] * (b2 + b3);
407         tmp[2][0] = B * src[i][2] + b1 * tmp[1][0]  + b2 * tmp[0][0]  + b3 * src[i][0];
408 
409         for (int j = 3; j < W; j++) {
410             tmp[j][0] = B * src[i][j] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
411         }
412 
413         float temp2Wm1  = src[i][W - 1] + M[0][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[0][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[0][2] * (tmp[W - 3][0] - src[i][W - 1]);
414         float temp2Ws   = src[i][W - 1] + M[1][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[1][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[1][2] * (tmp[W - 3][0] - src[i][W - 1]);
415         float temp2Wp1s = src[i][W - 1] + M[2][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[2][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[2][2] * (tmp[W - 3][0] - src[i][W - 1]);
416 
417         tmp[W - 1][0] = temp2Wm1;
418         tmp[W - 2][0] = B * tmp[W - 2][0] + b1 * tmp[W - 1][0] + b2 * temp2Ws + b3 * temp2Wp1s;
419         tmp[W - 3][0] = B * tmp[W - 3][0] + b1 * tmp[W - 2][0] + b2 * tmp[W - 1][0] + b3 * temp2Ws;
420 
421         for (int j = W - 4; j >= 0; j--) {
422             tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
423         }
424 
425         for (int j = 0; j < W; j++) {
426             dst[i][j] = tmp[j][0];
427         }
428     }
429 }
430 #endif
431 
432 // fast gaussian approximation if the support window is large
gaussHorizontal(T ** src,T ** dst,const int W,const int H,const double sigma)433 template<class T> void gaussHorizontal (T** src, T** dst, const int W, const int H, const double sigma)
434 {
435     double b1, b2, b3, B, M[3][3];
436     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
437 
438     for (int i = 0; i < 3; i++)
439         for (int j = 0; j < 3; j++) {
440             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
441         }
442 
443     double temp2[W] ALIGNED16;
444 
445 #ifdef _OPENMP
446     #pragma omp for
447 #endif
448 
449     for (int i = 0; i < H; i++) {
450 
451         temp2[0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0];
452         temp2[1] = B * src[i][1] + b1 * temp2[0]  + b2 * src[i][0] + b3 * src[i][0];
453         temp2[2] = B * src[i][2] + b1 * temp2[1]  + b2 * temp2[0]  + b3 * src[i][0];
454 
455         for (int j = 3; j < W; j++) {
456             temp2[j] = B * src[i][j] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3];
457         }
458 
459         double temp2Wm1 = src[i][W - 1] + M[0][0] * (temp2[W - 1] - src[i][W - 1]) + M[0][1] * (temp2[W - 2] - src[i][W - 1]) + M[0][2] * (temp2[W - 3] - src[i][W - 1]);
460         double temp2W   = src[i][W - 1] + M[1][0] * (temp2[W - 1] - src[i][W - 1]) + M[1][1] * (temp2[W - 2] - src[i][W - 1]) + M[1][2] * (temp2[W - 3] - src[i][W - 1]);
461         double temp2Wp1 = src[i][W - 1] + M[2][0] * (temp2[W - 1] - src[i][W - 1]) + M[2][1] * (temp2[W - 2] - src[i][W - 1]) + M[2][2] * (temp2[W - 3] - src[i][W - 1]);
462 
463         temp2[W - 1] = temp2Wm1;
464         temp2[W - 2] = B * temp2[W - 2] + b1 * temp2[W - 1] + b2 * temp2W + b3 * temp2Wp1;
465         temp2[W - 3] = B * temp2[W - 3] + b1 * temp2[W - 2] + b2 * temp2[W - 1] + b3 * temp2W;
466 
467         for (int j = W - 4; j >= 0; j--) {
468             temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3];
469         }
470 
471         for (int j = 0; j < W; j++) {
472             dst[i][j] = (T)temp2[j];
473         }
474 
475     }
476 }
477 
478 #ifdef __SSE2__
gaussVerticalSse(T ** src,T ** dst,const int W,const int H,const float sigma)479 template<class T> void gaussVerticalSse (T** src, T** dst, const int W, const int H, const float sigma)
480 {
481     double b1, b2, b3, B, M[3][3];
482     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
483 
484     for (int i = 0; i < 3; i++)
485         for (int j = 0; j < 3; j++) {
486             M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
487             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
488         }
489 
490     float tmp[H][8] ALIGNED16;
491     vfloat Rv;
492     vfloat Tv, Tm2v, Tm3v;
493     vfloat Rv1;
494     vfloat Tv1, Tm2v1, Tm3v1;
495     vfloat Bv, b1v, b2v, b3v;
496     vfloat temp2W, temp2Wp1;
497     vfloat temp2W1, temp2Wp11;
498     Bv = F2V(B);
499     b1v = F2V(b1);
500     b2v = F2V(b2);
501     b3v = F2V(b3);
502 
503 #ifdef _OPENMP
504     #pragma omp for nowait
505 #endif
506 
507     // process 8 columns per iteration for better usage of cpu cache
508     for (int i = 0; i < W - 7; i += 8) {
509         Tv = LVFU( src[0][i]);
510         Tv1 = LVFU( src[0][i + 4]);
511         Rv = Tv * (Bv + b1v + b2v + b3v);
512         Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
513         Tm3v = Rv;
514         Tm3v1 = Rv1;
515         STVF( tmp[0][0], Rv );
516         STVF( tmp[0][4], Rv1 );
517 
518         Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
519         Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
520         Tm2v = Rv;
521         Tm2v1 = Rv1;
522         STVF( tmp[1][0], Rv );
523         STVF( tmp[1][4], Rv1 );
524 
525         Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
526         Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
527         STVF( tmp[2][0], Rv );
528         STVF( tmp[2][4], Rv1 );
529 
530         for (int j = 3; j < H; j++) {
531             Tv = Rv;
532             Tv1 = Rv1;
533             Rv = LVFU(src[j][i]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
534             Rv1 = LVFU(src[j][i + 4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
535             STVF( tmp[j][0], Rv );
536             STVF( tmp[j][4], Rv1 );
537             Tm3v = Tm2v;
538             Tm3v1 = Tm2v1;
539             Tm2v = Tv;
540             Tm2v1 = Tv1;
541         }
542 
543         Tv = LVFU(src[H - 1][i]);
544         Tv1 = LVFU(src[H - 1][i + 4]);
545 
546         temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
547         temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
548         temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
549         temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
550 
551         Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
552         Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
553         STVFU( dst[H - 1][i], Rv );
554         STVFU( dst[H - 1][i + 4], Rv1 );
555 
556         Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
557         Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
558         STVFU( dst[H - 2][i], Tm2v );
559         STVFU( dst[H - 2][i + 4], Tm2v1 );
560 
561         Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
562         Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
563         STVFU( dst[H - 3][i], Tm3v );
564         STVFU( dst[H - 3][i + 4], Tm3v1 );
565 
566         Tv = Rv;
567         Tv1 = Rv1;
568         Rv = Tm3v;
569         Rv1 = Tm3v1;
570         Tm3v = Tv;
571         Tm3v1 = Tv1;
572 
573         for (int j = H - 4; j >= 0; j--) {
574             Tv = Rv;
575             Tv1 = Rv1;
576             Rv = LVF(tmp[j][0]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
577             Rv1 = LVF(tmp[j][4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
578             STVFU( dst[j][i], Rv );
579             STVFU( dst[j][i + 4], Rv1 );
580             Tm3v = Tm2v;
581             Tm3v1 = Tm2v1;
582             Tm2v = Tv;
583             Tm2v1 = Tv1;
584         }
585     }
586 
587 // Borders are done without SSE
588 #ifdef _OPENMP
589     #pragma omp single
590 #endif
591 
592     for (int i = W - (W % 8); i < W; i++) {
593         tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
594         tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
595         tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
596 
597         for (int j = 3; j < H; j++) {
598             tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
599         }
600 
601         float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
602         float temp2H   = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
603         float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
604 
605         tmp[H - 1][0] = temp2Hm1;
606         tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
607         tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
608 
609         for (int j = H - 4; j >= 0; j--) {
610             tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
611         }
612 
613         for (int j = 0; j < H; j++) {
614             dst[j][i] = tmp[j][0];
615         }
616 
617     }
618 }
619 #endif
620 
621 #ifdef __SSE2__
gaussVerticalSsemult(T ** RESTRICT src,T ** RESTRICT dst,const int W,const int H,const float sigma)622 template<class T> void gaussVerticalSsemult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const float sigma)
623 {
624     double b1, b2, b3, B, M[3][3];
625     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
626 
627     for (int i = 0; i < 3; i++)
628         for (int j = 0; j < 3; j++) {
629             M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
630             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
631         }
632 
633     float tmp[H][8] ALIGNED16;
634     vfloat Rv;
635     vfloat Tv, Tm2v, Tm3v;
636     vfloat Rv1;
637     vfloat Tv1, Tm2v1, Tm3v1;
638     vfloat Bv, b1v, b2v, b3v;
639     vfloat temp2W, temp2Wp1;
640     vfloat temp2W1, temp2Wp11;
641     Bv = F2V(B);
642     b1v = F2V(b1);
643     b2v = F2V(b2);
644     b3v = F2V(b3);
645 
646 #ifdef _OPENMP
647     #pragma omp for nowait
648 #endif
649 
650     // process 8 columns per iteration for better usage of cpu cache
651     for (int i = 0; i < W - 7; i += 8) {
652         Tv = LVFU( src[0][i]);
653         Tv1 = LVFU( src[0][i + 4]);
654         Rv = Tv * (Bv + b1v + b2v + b3v);
655         Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
656         Tm3v = Rv;
657         Tm3v1 = Rv1;
658         STVF( tmp[0][0], Rv );
659         STVF( tmp[0][4], Rv1 );
660 
661         Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
662         Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
663         Tm2v = Rv;
664         Tm2v1 = Rv1;
665         STVF( tmp[1][0], Rv );
666         STVF( tmp[1][4], Rv1 );
667 
668         Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
669         Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
670         STVF( tmp[2][0], Rv );
671         STVF( tmp[2][4], Rv1 );
672 
673         for (int j = 3; j < H; j++) {
674             Tv = Rv;
675             Tv1 = Rv1;
676             Rv = LVFU(src[j][i]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
677             Rv1 = LVFU(src[j][i + 4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
678             STVF( tmp[j][0], Rv );
679             STVF( tmp[j][4], Rv1 );
680             Tm3v = Tm2v;
681             Tm3v1 = Tm2v1;
682             Tm2v = Tv;
683             Tm2v1 = Tv1;
684         }
685 
686         Tv = LVFU(src[H - 1][i]);
687         Tv1 = LVFU(src[H - 1][i + 4]);
688 
689         temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
690         temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
691         temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
692         temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
693 
694         Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
695         Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
696         STVFU( dst[H - 1][i], LVFU(dst[H - 1][i]) * Rv );
697         STVFU( dst[H - 1][i + 4], LVFU(dst[H - 1][i + 4]) * Rv1 );
698 
699         Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
700         Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
701         STVFU( dst[H - 2][i], LVFU(dst[H - 2][i]) * Tm2v );
702         STVFU( dst[H - 2][i + 4], LVFU(dst[H - 2][i + 4]) * Tm2v1 );
703 
704         Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
705         Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
706         STVFU( dst[H - 3][i], LVFU(dst[H - 3][i]) * Tm3v );
707         STVFU( dst[H - 3][i + 4], LVFU(dst[H - 3][i + 4]) * Tm3v1 );
708 
709         Tv = Rv;
710         Tv1 = Rv1;
711         Rv = Tm3v;
712         Rv1 = Tm3v1;
713         Tm3v = Tv;
714         Tm3v1 = Tv1;
715 
716         for (int j = H - 4; j >= 0; j--) {
717             Tv = Rv;
718             Tv1 = Rv1;
719             Rv = LVF(tmp[j][0]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
720             Rv1 = LVF(tmp[j][4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
721             STVFU( dst[j][i], LVFU(dst[j][i]) * Rv );
722             STVFU( dst[j][i + 4], LVFU(dst[j][i + 4]) * Rv1 );
723             Tm3v = Tm2v;
724             Tm3v1 = Tm2v1;
725             Tm2v = Tv;
726             Tm2v1 = Tv1;
727         }
728     }
729 
730 // Borders are done without SSE
731 #ifdef _OPENMP
732     #pragma omp single
733 #endif
734 
735     for (int i = W - (W % 8); i < W; i++) {
736         tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
737         tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
738         tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
739 
740         for (int j = 3; j < H; j++) {
741             tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
742         }
743 
744         float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
745         float temp2H   = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
746         float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
747 
748         tmp[H - 1][0] = temp2Hm1;
749         tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
750         tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
751 
752         for (int j = H - 4; j >= 0; j--) {
753             tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
754         }
755 
756         for (int j = 0; j < H; j++) {
757             dst[j][i] *= tmp[j][0];
758         }
759 
760     }
761 }
762 
gaussVerticalSsediv(T ** src,T ** dst,T ** divBuffer,const int W,const int H,const float sigma)763 template<class T> void gaussVerticalSsediv (T** src, T** dst, T** divBuffer, const int W, const int H, const float sigma)
764 {
765     double b1, b2, b3, B, M[3][3];
766     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
767 
768     for (int i = 0; i < 3; i++)
769         for (int j = 0; j < 3; j++) {
770             M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
771             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
772         }
773 
774     float tmp[H][8] ALIGNED16;
775     vfloat Rv;
776     vfloat Tv, Tm2v, Tm3v;
777     vfloat Rv1;
778     vfloat Tv1, Tm2v1, Tm3v1;
779     vfloat Bv, b1v, b2v, b3v;
780     vfloat temp2W, temp2Wp1;
781     vfloat temp2W1, temp2Wp11;
782     vfloat onev = F2V(1.f);
783     Bv = F2V(B);
784     b1v = F2V(b1);
785     b2v = F2V(b2);
786     b3v = F2V(b3);
787 
788 #ifdef _OPENMP
789     #pragma omp for nowait
790 #endif
791 
792     // process 8 columns per iteration for better usage of cpu cache
793     for (int i = 0; i < W - 7; i += 8) {
794         Tv = LVFU( src[0][i]);
795         Tv1 = LVFU( src[0][i + 4]);
796         Rv = Tv * (Bv + b1v + b2v + b3v);
797         Rv1 = Tv1 * (Bv + b1v + b2v + b3v);
798         Tm3v = Rv;
799         Tm3v1 = Rv1;
800         STVF( tmp[0][0], Rv );
801         STVF( tmp[0][4], Rv1 );
802 
803         Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
804         Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v);
805         Tm2v = Rv;
806         Tm2v1 = Rv1;
807         STVF( tmp[1][0], Rv );
808         STVF( tmp[1][4], Rv1 );
809 
810         Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
811         Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v;
812         STVF( tmp[2][0], Rv );
813         STVF( tmp[2][4], Rv1 );
814 
815         for (int j = 3; j < H; j++) {
816             Tv = Rv;
817             Tv1 = Rv1;
818             Rv = LVFU(src[j][i]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
819             Rv1 = LVFU(src[j][i + 4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
820             STVF( tmp[j][0], Rv );
821             STVF( tmp[j][4], Rv1 );
822             Tm3v = Tm2v;
823             Tm3v1 = Tm2v1;
824             Tm2v = Tv;
825             Tm2v1 = Tv1;
826         }
827 
828         Tv = LVFU(src[H - 1][i]);
829         Tv1 = LVFU(src[H - 1][i + 4]);
830 
831         temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
832         temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1);
833         temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
834         temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1);
835 
836         Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
837         Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1);
838 
839         STVFU( dst[H - 1][i], LVFU(divBuffer[H - 1][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev));
840         STVFU( dst[H - 1][i + 4], LVFU(divBuffer[H - 1][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev));
841 
842         Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
843         Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11;
844         STVFU( dst[H - 2][i], LVFU(divBuffer[H - 2][i]) / vself(vmaskf_gt(Tm2v, ZEROV), Tm2v, onev));
845         STVFU( dst[H - 2][i + 4], LVFU(divBuffer[H - 2][i + 4]) / vself(vmaskf_gt(Tm2v1, ZEROV), Tm2v1, onev));
846 
847         Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
848         Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1;
849         STVFU( dst[H - 3][i], LVFU(divBuffer[H - 3][i]) / vself(vmaskf_gt(Tm3v, ZEROV), Tm3v, onev));
850         STVFU( dst[H - 3][i + 4], LVFU(divBuffer[H - 3][i + 4]) / vself(vmaskf_gt(Tm3v1, ZEROV), Tm3v1, onev));
851 
852         Tv = Rv;
853         Tv1 = Rv1;
854         Rv = Tm3v;
855         Rv1 = Tm3v1;
856         Tm3v = Tv;
857         Tm3v1 = Tv1;
858 
859         for (int j = H - 4; j >= 0; j--) {
860             Tv = Rv;
861             Tv1 = Rv1;
862             Rv = LVF(tmp[j][0]) * Bv +  Tv * b1v + Tm2v * b2v + Tm3v * b3v;
863             Rv1 = LVF(tmp[j][4]) * Bv +  Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v;
864             STVFU( dst[j][i], vmaxf(LVFU(divBuffer[j][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev), ZEROV));
865             STVFU( dst[j][i + 4], vmaxf(LVFU(divBuffer[j][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev), ZEROV));
866             Tm3v = Tm2v;
867             Tm3v1 = Tm2v1;
868             Tm2v = Tv;
869             Tm2v1 = Tv1;
870         }
871     }
872 
873 // Borders are done without SSE
874 #ifdef _OPENMP
875     #pragma omp single
876 #endif
877 
878     for (int i = W - (W % 8); i < W; i++) {
879         tmp[0][0] = src[0][i] * (B + b1 + b2 + b3);
880         tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3);
881         tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
882 
883         for (int j = 3; j < H; j++) {
884             tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
885         }
886 
887         float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
888         float temp2H   = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
889         float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
890 
891         tmp[H - 1][0] = temp2Hm1;
892         tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
893         tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
894 
895         for (int j = H - 4; j >= 0; j--) {
896             tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
897         }
898 
899         for (int j = 0; j < H; j++) {
900             dst[j][i] = librtprocess::max(divBuffer[j][i] / (tmp[j][0] > 0.f ? tmp[j][0] : 1.f), 0.f);
901         }
902 
903     }
904 }
905 
906 #endif
907 
gaussVertical(T ** src,T ** dst,const int W,const int H,const double sigma)908 template<class T> void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma)
909 {
910     double b1, b2, b3, B, M[3][3];
911     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
912 
913     for (int i = 0; i < 3; i++)
914         for (int j = 0; j < 3; j++) {
915             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
916         }
917 
918     // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
919     static const int numcols = 8;
920     double temp2[H][numcols] ALIGNED16;
921     double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
922 #ifdef _OPENMP
923     #pragma omp for nowait
924 #endif
925 
926     for (unsigned int i = 0; i < static_cast<unsigned>(std::max(0, W - numcols + 1)); i += numcols) {
927         for (int k = 0; k < numcols; k++) {
928             temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
929             temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
930             temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
931         }
932 
933         for (int j = 3; j < H; j++) {
934             for (int k = 0; k < numcols; k++) {
935                 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
936             }
937         }
938 
939         for (int k = 0; k < numcols; k++) {
940             temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
941             temp2H[k]   = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
942             temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
943         }
944 
945         for (int k = 0; k < numcols; k++) {
946             dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k];
947             dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
948             dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
949         }
950 
951         for (int j = H - 4; j >= 0; j--) {
952             for (int k = 0; k < numcols; k++) {
953                 dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k];
954             }
955         }
956     }
957 
958 #ifdef _OPENMP
959     #pragma omp single
960 #endif
961 
962     // process remaining columns
963     for (int i = W - (W % numcols); i < W; i++) {
964         temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
965         temp2[1][0] = B * src[1][i] + b1 * temp2[0][0]  + b2 * src[0][i] + b3 * src[0][i];
966         temp2[2][0] = B * src[2][i] + b1 * temp2[1][0]  + b2 * temp2[0][0]  + b3 * src[0][i];
967 
968         for (int j = 3; j < H; j++) {
969             temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
970         }
971 
972         double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
973         double temp2Hs   = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
974         double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
975 
976         dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1s;
977         dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s;
978         dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs;
979 
980         for (int j = H - 4; j >= 0; j--) {
981             dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0];
982         }
983     }
984 }
985 
986 #ifndef __SSE2__
gaussVerticaldiv(T ** src,T ** dst,T ** divBuffer,const int W,const int H,const double sigma)987 template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma)
988 {
989     double b1, b2, b3, B, M[3][3];
990     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
991 
992     for (int i = 0; i < 3; i++)
993         for (int j = 0; j < 3; j++) {
994             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
995         }
996 
997     // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
998     static const int numcols = 8;
999     double temp2[H][numcols] ALIGNED16;
1000     double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
1001 #ifdef _OPENMP
1002     #pragma omp for nowait
1003 #endif
1004 
1005     for (int i = 0; i < W - numcols + 1; i += numcols) {
1006         for (int k = 0; k < numcols; k++) {
1007             temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1008             temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1009             temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
1010         }
1011 
1012         for (int j = 3; j < H; j++) {
1013             for (int k = 0; k < numcols; k++) {
1014                 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
1015             }
1016         }
1017 
1018         for (int k = 0; k < numcols; k++) {
1019             temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1020             temp2H[k]   = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1021             temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1022         }
1023 
1024         for (int k = 0; k < numcols; k++) {
1025             dst[H - 1][i + k] = librtprocess::max(divBuffer[H - 1][i + k] / (temp2[H - 1][k] = temp2Hm1[k]), 0.0);
1026             dst[H - 2][i + k] = librtprocess::max(divBuffer[H - 2][i + k] / (temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0);
1027             dst[H - 3][i + k] = librtprocess::max(divBuffer[H - 3][i + k] / (temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]), 0.0);
1028         }
1029 
1030         for (int j = H - 4; j >= 0; j--) {
1031             for (int k = 0; k < numcols; k++) {
1032                 dst[j][i + k] = librtprocess::max(divBuffer[j][i + k] / (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]), 0.0);
1033             }
1034         }
1035     }
1036 
1037 #ifdef _OPENMP
1038     #pragma omp single
1039 #endif
1040 
1041     // process remaining columns
1042     for (int i = W - (W % numcols); i < W; i++) {
1043         temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
1044         temp2[1][0] = B * src[1][i] + b1 * temp2[0][0]  + b2 * src[0][i] + b3 * src[0][i];
1045         temp2[2][0] = B * src[2][i] + b1 * temp2[1][0]  + b2 * temp2[0][0]  + b3 * src[0][i];
1046 
1047         for (int j = 3; j < H; j++) {
1048             temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
1049         }
1050 
1051         double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
1052         double temp2Hs   = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
1053         double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
1054 
1055         dst[H - 1][i] = librtprocess::max(divBuffer[H - 1][i] / (temp2[H - 1][0] = temp2Hm1s), 0.0);
1056         dst[H - 2][i] = librtprocess::max(divBuffer[H - 2][i] / (temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s), 0.0);
1057         dst[H - 3][i] = librtprocess::max(divBuffer[H - 3][i] / (temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs), 0.0);
1058 
1059         for (int j = H - 4; j >= 0; j--) {
1060             dst[j][i] = librtprocess::max(divBuffer[j][i] / (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]), 0.0);
1061         }
1062     }
1063 }
1064 
gaussVerticalmult(T ** src,T ** dst,const int W,const int H,const double sigma)1065 template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma)
1066 {
1067     double b1, b2, b3, B, M[3][3];
1068     calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
1069 
1070     for (int i = 0; i < 3; i++)
1071         for (int j = 0; j < 3; j++) {
1072             M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
1073         }
1074 
1075     // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
1076     static const int numcols = 8;
1077     double temp2[H][numcols] ALIGNED16;
1078     double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
1079 #ifdef _OPENMP
1080     #pragma omp for nowait
1081 #endif
1082 
1083     for (int i = 0; i < W - numcols + 1; i += numcols) {
1084         for (int k = 0; k < numcols; k++) {
1085             temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1086             temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
1087             temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
1088         }
1089 
1090         for (int j = 3; j < H; j++) {
1091             for (int k = 0; k < numcols; k++) {
1092                 temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
1093             }
1094         }
1095 
1096         for (int k = 0; k < numcols; k++) {
1097             temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1098             temp2H[k]   = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1099             temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
1100         }
1101 
1102         for (int k = 0; k < numcols; k++) {
1103             dst[H - 1][i + k] *= temp2[H - 1][k] = temp2Hm1[k];
1104             dst[H - 2][i + k] *= temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
1105             dst[H - 3][i + k] *= temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
1106         }
1107 
1108         for (int j = H - 4; j >= 0; j--) {
1109             for (int k = 0; k < numcols; k++) {
1110                 dst[j][i + k] *= (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]);
1111             }
1112         }
1113     }
1114 
1115 #ifdef _OPENMP
1116     #pragma omp single
1117 #endif
1118 
1119     // process remaining columns
1120     for (int i = W - (W % numcols); i < W; i++) {
1121         temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
1122         temp2[1][0] = B * src[1][i] + b1 * temp2[0][0]  + b2 * src[0][i] + b3 * src[0][i];
1123         temp2[2][0] = B * src[2][i] + b1 * temp2[1][0]  + b2 * temp2[0][0]  + b3 * src[0][i];
1124 
1125         for (int j = 3; j < H; j++) {
1126             temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
1127         }
1128 
1129         double temp2Hm1s = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
1130         double temp2Hs   = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
1131         double temp2Hp1s = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
1132 
1133         dst[H - 1][i] *= temp2[H - 1][0] = temp2Hm1s;
1134         dst[H - 2][i] *= temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2Hs + b3 * temp2Hp1s;
1135         dst[H - 3][i] *= temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2Hs;
1136 
1137         for (int j = H - 4; j >= 0; j--) {
1138             dst[j][i] *= (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]);
1139         }
1140     }
1141 }
1142 #endif
1143 
1144 template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr)
1145 {
1146     static constexpr auto GAUSS_SKIP = 0.25;
1147     static constexpr auto GAUSS_3X3_LIMIT = 0.6;
1148     static constexpr auto GAUSS_DOUBLE = 25.0;
1149 
1150     if(buffer) {
1151         // special variant for very large sigma, currently only used by retinex algorithm
1152         // use iterated boxblur to approximate gaussian blur
1153         // Compute ideal averaging filter width and number of iterations
1154         int n = 1;
1155         double wIdeal = sqrt((12 * sigma * sigma) + 1);
1156 
1157         while(wIdeal > W || wIdeal > H) {
1158             n++;
1159             wIdeal = sqrt((12 * sigma * sigma / n) + 1);
1160         }
1161 
1162         if(n < 3) {
1163             n = 3;
1164             wIdeal = sqrt((12 * sigma * sigma / n) + 1);
1165         } else if(n > 6) {
1166             n = 6;
1167         }
1168 
1169         int wl = wIdeal;
1170 
1171         if(wl % 2 == 0) {
1172             wl--;
1173         }
1174 
1175         int wu = wl + 2;
1176 
1177         double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
1178         int m = round(mIdeal);
1179 
1180         int sizes[n];
1181 
1182         for(int i = 0; i < n; i++) {
1183             sizes[i] = ((i < m ? wl : wu) - 1) / 2;
1184         }
1185 
1186         librtprocess::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
1187 
1188         for(int i = 1; i < n; i++) {
1189             librtprocess::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
1190         }
1191     } else {
1192         if (sigma < GAUSS_SKIP) {
1193             // don't perform filtering
1194             if (src != dst) {
1195                 for(int i = 0; i < H; ++i) {
1196                     memcpy(dst[i], src[i], W * sizeof(T));
1197                 }
1198             }
1199         } else if (sigma < GAUSS_3X3_LIMIT) {
1200             if(src != dst) {
1201                 // If src != dst we can take the fast way
1202                 // compute 3x3 kernel values
1203                 double c0 = 1.0;
1204                 double c1 = exp( -0.5 * (librtprocess::SQR(1.0 / sigma)) );
1205                 double c2 = exp( -librtprocess::SQR(1.0 / sigma) );
1206 
1207                 // normalize kernel values
1208                 double sum = c0 + 4.0 * (c1 + c2);
1209                 c0 /= sum;
1210                 c1 /= sum;
1211                 c2 /= sum;
1212                 // compute kernel values for border pixels
1213                 double b1 = exp (-1.0 / (2.0 * sigma * sigma));
1214                 double bsum = 2.0 * b1 + 1.0;
1215                 b1 /= bsum;
1216                 double b0 = 1.0 / bsum;
1217 
1218                 switch (gausstype) {
1219                 case GAUSS_MULT     :
1220                     gauss3x3mult<T> (src, dst, W, H, c0, c1, c2, b0, b1);
1221                     break;
1222 
1223                 case GAUSS_DIV      :
1224                     gauss3x3div<T> (src, dst, buffer2, W, H, c0, c1, c2, b0, b1);
1225                     break;
1226 
1227                 case GAUSS_STANDARD :
1228                     gauss3x3<T> (src, dst, W, H, c0, c1, c2, b0, b1);
1229                     break;
1230                 }
1231             } else {
1232                 // compute kernel values for separated 3x3 gaussian blur
1233                 double c1 = exp (-1.0 / (2.0 * sigma * sigma));
1234                 double csum = 2.0 * c1 + 1.0;
1235                 c1 /= csum;
1236                 double c0 = 1.0 / csum;
1237                 gaussHorizontal3<T> (src, dst, W, H, c0, c1);
1238                 gaussVertical3<T>   (dst, dst, W, H, c0, c1);
1239             }
1240         } else {
1241 #ifdef __SSE2__
1242 
1243             if (sigma < GAUSS_DOUBLE) {
1244                 switch (gausstype) {
1245                 case GAUSS_MULT : {
1246                     gaussHorizontalSse<T> (src, src, W, H, sigma);
1247                     gaussVerticalSsemult<T> (src, dst, W, H, sigma);
1248                     break;
1249                 }
1250 
1251                 case GAUSS_DIV : {
1252                     gaussHorizontalSse<T> (src, dst, W, H, sigma);
1253                     gaussVerticalSsediv<T> (dst, dst, buffer2, W, H, sigma);
1254                     break;
1255                 }
1256 
1257                 case GAUSS_STANDARD : {
1258                     gaussHorizontalSse<T> (src, dst, W, H, sigma);
1259                     gaussVerticalSse<T> (dst, dst, W, H, sigma);
1260                     break;
1261                 }
1262                 }
1263             } else { // large sigma only with double precision
1264                 gaussHorizontal<T> (src, dst, W, H, sigma);
1265                 gaussVertical<T>   (dst, dst, W, H, sigma);
1266             }
1267 
1268 #else
1269 
1270             if (sigma < GAUSS_DOUBLE) {
1271                 switch (gausstype) {
1272                 case GAUSS_MULT : {
1273                     gaussHorizontal<T> (src, src, W, H, sigma);
1274                     gaussVerticalmult<T> (src, dst, W, H, sigma);
1275                     break;
1276                 }
1277 
1278                 case GAUSS_DIV : {
1279                     gaussHorizontal<T> (src, dst, W, H, sigma);
1280                     gaussVerticaldiv<T> (dst, dst, buffer2, W, H, sigma);
1281                     break;
1282                 }
1283 
1284                 case GAUSS_STANDARD : {
1285                     gaussHorizontal<T> (src, dst, W, H, sigma);
1286                     gaussVertical<T> (dst, dst, W, H, sigma);
1287                     break;
1288                 }
1289                 }
1290             } else { // large sigma only with double precision
1291                 gaussHorizontal<T> (src, dst, W, H, sigma);
1292                 gaussVertical<T>   (dst, dst, W, H, sigma);
1293             }
1294 
1295 #endif
1296         }
1297     }
1298 }
1299 
1300 static void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr)
1301 {
1302     gaussianBlurImpl<float>(src, dst, W, H, sigma, buffer, gausstype, buffer2);
1303 }
1304 
1305 
1306