1 
2 
3 #include "traster.h"
4 #include "trop.h"
5 #include "tpixelgr.h"
6 
7 #if defined(_WIN32) && defined(x64)
8 #define USE_SSE2
9 #endif
10 
11 #ifdef USE_SSE2
12 #include <emmintrin.h>
13 #include <stdlib.h>
14 #endif
15 
16 
17 namespace {
18 
19 #ifdef _WIN32
20 template <class T>
21 struct BlurPixel {
22   T b;
23   T g;
24   T r;
25   T m;
26 };
27 
28 #else
29 template <class T>
30 struct BlurPixel {
31   T r;
32   T g;
33   T b;
34   T m;
35 };
36 
37 #endif
38 
39 //===================================================================
40 
41 #define LOAD_COL_CODE                                                          \
42                                                                                \
43   buffer += x;                                                                 \
44   pix = col + by1;                                                             \
45                                                                                \
46   for (i = by1; i < ly + by1; i++) {                                           \
47     *pix++ = *buffer;                                                          \
48     buffer += lx;                                                              \
49   }                                                                            \
50                                                                                \
51   pix += by2;                                                                  \
52   left_val  = col[0];                                                          \
53   right_val = *(pix - 1);                                                      \
54   col--;                                                                       \
55                                                                                \
56   for (i = 0; i < brad; i++) {                                                 \
57     *col-- = left_val;                                                         \
58     *pix++ = right_val;                                                        \
59   }
60 
61 //-------------------------------------------------------------------
62 
63 #define BLUR_CODE(round_fac, channel_type)                                     \
64   pix1 = row1;                                                                 \
65   pix2 = row1 - 1;                                                             \
66                                                                                \
67   sigma1.r = pix1->r;                                                          \
68   sigma1.g = pix1->g;                                                          \
69   sigma1.b = pix1->b;                                                          \
70   sigma1.m = pix1->m;                                                          \
71   pix1++;                                                                      \
72                                                                                \
73   sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0;                             \
74   sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0;                             \
75                                                                                \
76   for (i = 1; i < brad; i++) {                                                 \
77     sigma1.r += pix1->r;                                                       \
78     sigma1.g += pix1->g;                                                       \
79     sigma1.b += pix1->b;                                                       \
80     sigma1.m += pix1->m;                                                       \
81                                                                                \
82     sigma2.r += pix2->r;                                                       \
83     sigma2.g += pix2->g;                                                       \
84     sigma2.b += pix2->b;                                                       \
85     sigma2.m += pix2->m;                                                       \
86                                                                                \
87     sigma3.r += i * (pix1->r + pix2->r);                                       \
88     sigma3.g += i * (pix1->g + pix2->g);                                       \
89     sigma3.b += i * (pix1->b + pix2->b);                                       \
90     sigma3.m += i * (pix1->m + pix2->m);                                       \
91                                                                                \
92     pix1++;                                                                    \
93     pix2--;                                                                    \
94   }                                                                            \
95                                                                                \
96   rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac);      \
97   gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac);      \
98   bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac);      \
99   msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac);      \
100                                                                                \
101   row2->r = (channel_type)(rsum);                                              \
102   row2->g = (channel_type)(gsum);                                              \
103   row2->b = (channel_type)(bsum);                                              \
104   row2->m = (channel_type)(msum);                                              \
105   row2++;                                                                      \
106                                                                                \
107   sigma2.r += row1[-brad].r;                                                   \
108   sigma2.g += row1[-brad].g;                                                   \
109   sigma2.b += row1[-brad].b;                                                   \
110   sigma2.m += row1[-brad].m;                                                   \
111                                                                                \
112   pix1 = row1 + brad;                                                          \
113   pix2 = row1;                                                                 \
114   pix3 = row1 - brad;                                                          \
115   pix4 = row1 - brad + 1;                                                      \
116                                                                                \
117   desigma.r = sigma1.r - sigma2.r;                                             \
118   desigma.g = sigma1.g - sigma2.g;                                             \
119   desigma.b = sigma1.b - sigma2.b;                                             \
120   desigma.m = sigma1.m - sigma2.m;                                             \
121                                                                                \
122   for (i = 1; i < length; i++) {                                               \
123     desigma.r += pix1->r - 2 * pix2->r + pix3->r;                              \
124     desigma.g += pix1->g - 2 * pix2->g + pix3->g;                              \
125     desigma.b += pix1->b - 2 * pix2->b + pix3->b;                              \
126     desigma.m += pix1->m - 2 * pix2->m + pix3->m;                              \
127                                                                                \
128     rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq;                 \
129     gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq;                 \
130     bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq;                 \
131     msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq;                 \
132                                                                                \
133     row2->r = (channel_type)(rsum);                                            \
134     row2->g = (channel_type)(gsum);                                            \
135     row2->b = (channel_type)(bsum);                                            \
136     row2->m = (channel_type)(msum);                                            \
137     row2++;                                                                    \
138     pix1++, pix2++, pix3++, pix4++;                                            \
139   }
140 
141 //-------------------------------------------------------------------
142 
143 template <typename PIXEL_SRC, typename PIXEL_DST, typename T>
blur_code(PIXEL_SRC * row1,PIXEL_DST * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)144 inline void blur_code(PIXEL_SRC *row1, PIXEL_DST *row2, int length, float coeff,
145                       float coeffq, int brad, float diff, float round_fac) {
146   int i;
147   T rsum, gsum, bsum, msum;
148 
149   BlurPixel<T> sigma1, sigma2, sigma3, desigma;
150   PIXEL_SRC *pix1, *pix2, *pix3, *pix4;
151 
152   pix1 = row1;
153   pix2 = row1 - 1;
154 
155   sigma1.r = pix1->r;
156   sigma1.g = pix1->g;
157   sigma1.b = pix1->b;
158   sigma1.m = pix1->m;
159   pix1++;
160 
161   sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0;
162   sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0;
163 
164   for (i = 1; i < brad; i++) {
165     sigma1.r += pix1->r;
166     sigma1.g += pix1->g;
167     sigma1.b += pix1->b;
168     sigma1.m += pix1->m;
169 
170     sigma2.r += pix2->r;
171     sigma2.g += pix2->g;
172     sigma2.b += pix2->b;
173     sigma2.m += pix2->m;
174 
175     sigma3.r += i * (pix1->r + pix2->r);
176     sigma3.g += i * (pix1->g + pix2->g);
177     sigma3.b += i * (pix1->b + pix2->b);
178     sigma3.m += i * (pix1->m + pix2->m);
179 
180     pix1++;
181     pix2--;
182   }
183 
184   rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac);
185   gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac);
186   bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac);
187   msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac);
188 
189   row2->r = rsum;
190   row2->g = gsum;
191   row2->b = bsum;
192   row2->m = msum;
193   row2++;
194 
195   sigma2.r += row1[-brad].r;
196   sigma2.g += row1[-brad].g;
197   sigma2.b += row1[-brad].b;
198   sigma2.m += row1[-brad].m;
199 
200   pix1 = row1 + brad;
201   pix2 = row1;
202   pix3 = row1 - brad;
203   pix4 = row1 - brad + 1;
204 
205   desigma.r = sigma1.r - sigma2.r;
206   desigma.g = sigma1.g - sigma2.g;
207   desigma.b = sigma1.b - sigma2.b;
208   desigma.m = sigma1.m - sigma2.m;
209 
210   for (i = 1; i < length; i++) {
211     desigma.r += pix1->r - 2 * pix2->r + pix3->r;
212     desigma.g += pix1->g - 2 * pix2->g + pix3->g;
213     desigma.b += pix1->b - 2 * pix2->b + pix3->b;
214     desigma.m += pix1->m - 2 * pix2->m + pix3->m;
215 
216     rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq;
217     gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq;
218     bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq;
219     msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq;
220 
221     row2->r = rsum;
222     row2->g = gsum;
223     row2->b = bsum;
224     row2->m = msum;
225     row2++;
226     pix1++, pix2++, pix3++, pix4++;
227   }
228 }
229 
230 //-------------------------------------------------------------------
231 
232 #ifdef USE_SSE2
233 
234 //-------------------------------------------------------------------
235 template <class T, class P>
blur_code_SSE2(T * row1,BlurPixel<P> * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)236 inline void blur_code_SSE2(T *row1, BlurPixel<P> *row2, int length, float coeff,
237                            float coeffq, int brad, float diff,
238                            float round_fac) {
239   static float two     = 2;
240   static __m128i zeros = _mm_setzero_si128();
241   static __m128 twos   = _mm_load_ps1(&two);
242 
243   int i;
244 
245   __m128 sigma1, sigma2, sigma3, desigma;
246   T *pix1, *pix2, *pix3, *pix4;
247 
248   pix1 = row1;
249   pix2 = row1 - 1;
250 
251   //
252   __m128i piPix1 = _mm_cvtsi32_si128(*(DWORD *)pix1);
253   __m128i piPix2 = _mm_cvtsi32_si128(*(DWORD *)pix2);
254 
255   piPix1 = _mm_unpacklo_epi8(piPix1, zeros);
256   piPix2 = _mm_unpacklo_epi8(piPix2, zeros);
257   piPix1 = _mm_unpacklo_epi16(piPix1, zeros);
258   piPix2 = _mm_unpacklo_epi16(piPix2, zeros);
259 
260   sigma1 = _mm_cvtepi32_ps(piPix1);
261   //
262 
263   pix1++;
264 
265   float zero = 0;
266   sigma2     = _mm_load1_ps(&zero);
267   sigma3     = _mm_load1_ps(&zero);
268 
269   for (i = 1; i < brad; i++) {
270     piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
271     piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
272 
273     __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
274     __m128 pPix2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix2, zeros));
275 
276     sigma1 = _mm_add_ps(sigma1, pPix1);
277     sigma2 = _mm_add_ps(sigma2, pPix2);
278 
279     __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
280     __m128 pi   = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
281 
282     pPix1  = _mm_add_ps(pPix1, pPix2);
283     pPix1  = _mm_mul_ps(pi, pPix1);      // i*(pix1 + pix2)
284     sigma3 = _mm_add_ps(sigma3, pPix1);  // sigma3 += i*(pix1 + pix2)
285 
286     pix1++;
287     pix2--;
288   }
289 
290   __m128 pCoeff    = _mm_load1_ps(&coeff);
291   __m128 pCoeffq   = _mm_load1_ps(&coeffq);
292   __m128 pRoundFac = _mm_load1_ps(&round_fac);
293   __m128 pDiff     = _mm_load1_ps(&diff);
294 
295   // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
296   __m128 sum  = _mm_add_ps(sigma1, sigma2);
297   sum         = _mm_mul_ps(sum, pCoeff);
298   __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
299   sum2        = _mm_add_ps(sum2, pRoundFac);
300   sum         = _mm_sub_ps(sum, sum2);
301   /*
302   __m128i isum = _mm_cvtps_epi32(sum);
303 isum = _mm_packs_epi32(isum, zeros);
304 isum = _mm_packs_epi16(isum, zeros);
305 
306 *(DWORD*)row2 = _mm_cvtsi128_si32(isum);
307 */
308   _mm_store_ps((float *)row2, sum);
309   row2++;
310 
311   __m128i piPixMin =
312       _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)(row1 - brad)), zeros);
313   __m128 pPixMin = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPixMin, zeros));
314   sigma2         = _mm_add_ps(sigma2, pPixMin);
315   /*
316 sigma2.r += row1[-brad].r;
317 sigma2.g += row1[-brad].g;
318 sigma2.b += row1[-brad].b;
319 sigma2.m += row1[-brad].m;
320 */
321 
322   pix1 = row1 + brad;
323   pix2 = row1;
324   pix3 = row1 - brad;
325   pix4 = row1 - brad + 1;
326 
327   desigma = _mm_sub_ps(sigma1, sigma2);
328 
329   for (i = 1; i < length; i++) {
330     piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
331     piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
332     __m128i piPix3 =
333         _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix3), zeros);
334     __m128i piPix4 =
335         _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix4), zeros);
336 
337     __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
338     __m128 pPix2 =
339         _mm_cvtepi32_ps(_mm_slli_epi32(_mm_unpacklo_epi16(piPix2, zeros), 1));
340     __m128 pPix3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix3, zeros));
341     __m128 pPix4 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix4, zeros));
342 
343     // desigma += pix1 - 2*pix2 + pix3
344     __m128 tmp = _mm_sub_ps(pPix3, pPix2);
345     tmp        = _mm_add_ps(tmp, pPix1);
346     desigma    = _mm_add_ps(desigma, tmp);
347 
348     // sum += (desigma + diff*(pix1 - pix4))*coeffq
349     tmp = _mm_sub_ps(pPix1, pPix4);
350     tmp = _mm_mul_ps(tmp, pDiff);
351     tmp = _mm_add_ps(desigma, tmp);
352     tmp = _mm_mul_ps(tmp, pCoeffq);
353     sum = _mm_add_ps(sum, tmp);
354     /*
355 isum = _mm_cvtps_epi32(sum);
356 isum = _mm_packs_epi32(isum, zeros);
357 isum = _mm_packs_epi16(isum, zeros);
358 
359 *(DWORD*)row2 = _mm_cvtsi128_si32(isum);
360 */
361     _mm_store_ps((float *)row2, sum);
362 
363     row2++;
364     pix1++, pix2++, pix3++, pix4++;
365   }
366 }
367 
368 //-------------------------------------------------------------------
369 template <class T, class P>
blur_code_SSE2(BlurPixel<P> * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)370 inline void blur_code_SSE2(BlurPixel<P> *row1, T *row2, int length, float coeff,
371                            float coeffq, int brad, float diff,
372                            float round_fac) {
373   int i;
374 
375   float two     = 2;
376   __m128i zeros = _mm_setzero_si128();
377   __m128 twos   = _mm_load_ps1(&two);
378 
379   __m128 sigma1, sigma2, sigma3, desigma;
380   BlurPixel<P> *pix1, *pix2, *pix3, *pix4;
381 
382   pix1 = row1;
383   pix2 = row1 - 1;
384 
385   __m128 pPix1 = _mm_load_ps((float *)pix1);
386   __m128 pPix2 = _mm_load_ps((float *)pix2);
387 
388   // sigma1 = *pix1
389   sigma1 = pPix1;
390 
391   pix1++;
392 
393   float zero = 0;
394   sigma2     = _mm_load1_ps(&zero);
395   sigma3     = _mm_load1_ps(&zero);
396 
397   for (i = 1; i < brad; i++) {
398     pPix1 = _mm_load_ps((float *)pix1);
399     pPix2 = _mm_load_ps((float *)pix2);
400 
401     sigma1 = _mm_add_ps(sigma1, pPix1);
402     sigma2 = _mm_add_ps(sigma2, pPix2);
403 
404     __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
405     __m128 pi   = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
406 
407     pPix1  = _mm_add_ps(pPix1, pPix2);
408     pPix1  = _mm_mul_ps(pi, pPix1);      // i*(pix1 + pix2)
409     sigma3 = _mm_add_ps(sigma3, pPix1);  // sigma3 += i*(pix1 + pix2)
410 
411     pix1++;
412     pix2--;
413   }
414 
415   __m128 pCoeff  = _mm_load1_ps(&coeff);
416   __m128 pCoeffq = _mm_load1_ps(&coeffq);
417   //  __m128 pRoundFac = _mm_load1_ps(&round_fac);
418   __m128 pDiff = _mm_load1_ps(&diff);
419 
420   // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
421   __m128 sum  = _mm_add_ps(sigma1, sigma2);
422   sum         = _mm_mul_ps(sum, pCoeff);
423   __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
424   // sum2 = _mm_add_ps(sum2, pRoundFac);
425   sum = _mm_sub_ps(sum, sum2);
426 
427   // converte i canali da float a char
428   __m128i isum = _mm_cvtps_epi32(sum);
429   isum         = _mm_packs_epi32(isum, zeros);
430   // isum = _mm_packs_epi16(isum, zeros);
431   isum           = _mm_packus_epi16(isum, zeros);
432   *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
433 
434   row2++;
435 
436   // sigma2 += row1[-brad]
437   __m128 pPixMin = _mm_load_ps((float *)(row1 - brad));
438   sigma2         = _mm_add_ps(sigma2, pPixMin);
439 
440   pix1 = row1 + brad;
441   pix2 = row1;
442   pix3 = row1 - brad;
443   pix4 = row1 - brad + 1;
444 
445   desigma = _mm_sub_ps(sigma1, sigma2);
446 
447   for (i = 1; i < length; i++) {
448     __m128 pPix1 = _mm_load_ps((float *)pix1);
449     __m128 pPix2 = _mm_load_ps((float *)pix2);
450     __m128 pPix3 = _mm_load_ps((float *)pix3);
451     __m128 pPix4 = _mm_load_ps((float *)pix4);
452 
453     pPix2 = _mm_mul_ps(pPix2, twos);
454 
455     // desigma += pix1 - 2*pix2 + pix3
456     __m128 tmp = _mm_sub_ps(pPix3, pPix2);
457     tmp        = _mm_add_ps(tmp, pPix1);
458     desigma    = _mm_add_ps(desigma, tmp);
459 
460     // sum += (desigma + diff*(pix1 - pix4))*coeffq
461     tmp = _mm_sub_ps(pPix1, pPix4);
462     tmp = _mm_mul_ps(tmp, pDiff);
463     tmp = _mm_add_ps(desigma, tmp);
464     tmp = _mm_mul_ps(tmp, pCoeffq);
465     sum = _mm_add_ps(sum, tmp);
466 
467     // converte i canali da float a char
468     __m128i isum = _mm_cvtps_epi32(sum);
469     isum         = _mm_packs_epi32(isum, zeros);
470     // isum = _mm_packs_epi16(isum, zeros);  // QUESTA RIGA E' SBAGLIATA
471     // assert(false);
472     isum           = _mm_packus_epi16(isum, zeros);
473     *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
474 
475     row2++;
476     pix1++, pix2++, pix3++, pix4++;
477   }
478 }
479 
480 #endif  // _WIN32
481 
482 //-------------------------------------------------------------------
483 
484 #define STORE_COL_CODE(crop_val)                                               \
485   {                                                                            \
486     int i, val;                                                                \
487     double ampl;                                                               \
488     buffer += x;                                                               \
489                                                                                \
490     ampl = 1.0 + blur / 15.0;                                                  \
491                                                                                \
492     if (backlit)                                                               \
493       for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {      \
494         val       = troundp(col[i].r * ampl);                                  \
495         buffer->r = (val > crop_val) ? crop_val : val;                         \
496         val       = troundp(col[i].g * ampl);                                  \
497         buffer->g = (val > crop_val) ? crop_val : val;                         \
498         val       = troundp(col[i].b * ampl);                                  \
499         buffer->b = (val > crop_val) ? crop_val : val;                         \
500         val       = troundp(col[i].m * ampl);                                  \
501         buffer->m = (val > crop_val) ? crop_val : val;                         \
502         buffer += wrap;                                                        \
503       }                                                                        \
504     else                                                                       \
505       for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {      \
506         *buffer = col[i];                                                      \
507         buffer += wrap;                                                        \
508       }                                                                        \
509   }
510 
511 //-------------------------------------------------------------------
512 template <class T>
store_colRgb(T * buffer,int wrap,int r_ly,T * col,int ly,int x,int dy,int backlit,double blur)513 void store_colRgb(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
514                   int backlit, double blur) {
515   int val = T::maxChannelValue;
516 
517   if (val == 255)
518     STORE_COL_CODE(204)
519   else if (val == 65535)
520     STORE_COL_CODE(204 * 257)
521   else
522     assert(false);
523 }
524 
525 //-------------------------------------------------------------------
526 template <class T>
store_colGray(T * buffer,int wrap,int r_ly,T * col,int ly,int x,int dy,int backlit,double blur)527 void store_colGray(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
528                    int backlit, double blur) {
529   int i;
530   double ampl;
531   buffer += x;
532 
533   ampl = 1.0 + blur / 15.0;
534 
535   for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {
536     *buffer = col[i];
537     buffer += wrap;
538   }
539 }
540 
541 //-------------------------------------------------------------------
542 template <class P>
load_colRgb(BlurPixel<P> * buffer,BlurPixel<P> * col,int lx,int ly,int x,int brad,int by1,int by2)543 void load_colRgb(BlurPixel<P> *buffer, BlurPixel<P> *col, int lx, int ly, int x,
544                  int brad, int by1, int by2) {
545   int i;
546   BlurPixel<P> *pix, left_val, right_val;
547 
548   LOAD_COL_CODE
549 }
550 
551 //-------------------------------------------------------------------
552 
load_channel_col32(float * buffer,float * col,int lx,int ly,int x,int brad,int by1,int by2)553 void load_channel_col32(float *buffer, float *col, int lx, int ly, int x,
554                         int brad, int by1, int by2) {
555   int i;
556   float *pix, left_val, right_val;
557 
558   LOAD_COL_CODE
559 }
560 
561 //-------------------------------------------------------------------
562 template <class T, class Q, class P>
do_filtering_chan(BlurPixel<P> * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff,bool useSSE)563 void do_filtering_chan(BlurPixel<P> *row1, T *row2, int length, float coeff,
564                        float coeffq, int brad, float diff, bool useSSE) {
565 #ifdef USE_SSE2
566   if (useSSE && T::maxChannelValue == 255)
567     blur_code_SSE2<T, P>(row1, row2, length, coeff, coeffq, brad, diff, 0.5);
568   else
569 #endif
570   {
571     int i;
572     P rsum, gsum, bsum, msum;
573     BlurPixel<P> sigma1, sigma2, sigma3, desigma;
574     BlurPixel<P> *pix1, *pix2, *pix3, *pix4;
575 
576     BLUR_CODE((P)0.5, Q)
577   }
578 }
579 
580 //-------------------------------------------------------------------
581 
582 template <class T>
do_filtering_channel_float(T * row1,float * row2,int length,float coeff,float coeffq,int brad,float diff)583 void do_filtering_channel_float(T *row1, float *row2, int length, float coeff,
584                                 float coeffq, int brad, float diff) {
585   int i;
586   float sum;
587   float sigma1, sigma2, sigma3, desigma;
588   T *pix1, *pix2, *pix3, *pix4;
589 
590   pix1 = row1;
591   pix2 = row1 - 1;
592 
593   sigma1 = pix1->value;
594   pix1++;
595 
596   sigma2 = 0.0;
597   sigma3 = 0.0;
598 
599   for (i = 1; i < brad; i++) {
600     sigma1 += pix1->value;
601     sigma2 += pix2->value;
602     sigma3 += i * (pix1->value + pix2->value);
603     pix1++;
604     pix2--;
605   }
606 
607   sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq;
608 
609   *row2 = sum;
610   row2++;
611 
612   sigma2 += row1[-brad].value;
613 
614   pix1 = row1 + brad;
615   pix2 = row1;
616   pix3 = row1 - brad;
617   pix4 = row1 - brad + 1;
618 
619   desigma = sigma1 - sigma2;
620 
621   for (i = 1; i < length; i++) {
622     desigma += pix1->value - 2 * pix2->value + pix3->value;
623 
624     sum += (desigma + diff * (pix1->value - pix4->value)) * coeffq;
625 
626     *row2 = sum;
627     row2++;
628     pix1++, pix2++, pix3++, pix4++;
629   }
630 }
631 
632 //-------------------------------------------------------------------
633 template <class T>
do_filtering_channel_gray(float * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff)634 void do_filtering_channel_gray(float *row1, T *row2, int length, float coeff,
635                                float coeffq, int brad, float diff) {
636   int i;
637   float sum;
638   float sigma1, sigma2, sigma3, desigma;
639   float *pix1, *pix2, *pix3, *pix4;
640 
641   pix1 = row1;
642   pix2 = row1 - 1;
643 
644   sigma1 = *pix1;
645   pix1++;
646 
647   sigma2 = 0.0;
648   sigma3 = 0.0;
649 
650   for (i = 1; i < brad; i++) {
651     sigma1 += *pix1;
652     sigma2 += *pix2;
653     sigma3 += i * (*pix1 + *pix2);
654     pix1++;
655     pix2--;
656   }
657 
658   sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq + 0.5F;
659 
660   row2->setValue((int)sum);
661   row2++;
662 
663   sigma2 += row1[-brad];
664 
665   pix1 = row1 + brad;
666   pix2 = row1;
667   pix3 = row1 - brad;
668   pix4 = row1 - brad + 1;
669 
670   desigma = sigma1 - sigma2;
671 
672   for (i = 1; i < length; i++) {
673     desigma += *pix1 - 2 * (*pix2) + (*pix3);
674 
675     sum += (desigma + diff * (*pix1 - *pix4)) * coeffq;
676 
677     row2->setValue((int)sum);
678     row2++;
679     pix1++, pix2++, pix3++, pix4++;
680   }
681 }
682 
683 //-------------------------------------------------------------------
684 template <class T>
load_rowRgb(TRasterPT<T> & rin,T * row,int lx,int y,int brad,int bx1,int bx2)685 void load_rowRgb(TRasterPT<T> &rin, T *row, int lx, int y, int brad, int bx1,
686                  int bx2) {
687   int i;
688   T *buf32, *pix;
689   T left_val, right_val;
690 
691   pix = row + bx1;
692 
693   {
694     rin->lock();
695     buf32 = rin->pixels(y);
696 
697     for (i = 0; i < lx; i++) *pix++ = *buf32++;
698     rin->unlock();
699   }
700 
701   pix += bx2;
702   left_val  = *row;
703   right_val = *(pix - 1);
704   row--;
705 
706   for (i = 0; i < brad;
707        i++) /* pixels equal to the ones of border of image are added   */
708   {         /* to avoid a black blur to get into the picture.          */
709     *row-- = left_val;
710     *pix++ = right_val;
711   }
712 }
713 
714 //-------------------------------------------------------------------
715 template <class T>
load_rowGray(TRasterPT<T> & rin,T * row,int lx,int y,int brad,int bx1,int bx2)716 void load_rowGray(TRasterPT<T> &rin, T *row, int lx, int y, int brad, int bx1,
717                   int bx2) {
718   int i;
719   T *buf8, *pix;
720   T left_val, right_val;
721 
722   pix  = row + bx1;
723   buf8 = (T *)(rin->pixels(y));
724 
725   for (i = 0; i < lx; i++) *pix++ = *buf8++;
726 
727   pix += bx2;
728   left_val  = *row;
729   right_val = *(pix - 1);
730   row--;
731 
732   for (i = 0; i < brad;
733        i++) /* pixels equal to the ones of border of image are added   */
734   {         /* to avoid a black blur to get into the picture.          */
735     *row-- = left_val;
736     *pix++ = right_val;
737   }
738 }
739 
740 //-------------------------------------------------------------------
741 template <class T, class P>
do_filtering_floatRgb(T * row1,BlurPixel<P> * row2,int length,float coeff,float coeffq,int brad,float diff,bool useSSE)742 void do_filtering_floatRgb(T *row1, BlurPixel<P> *row2, int length, float coeff,
743                            float coeffq, int brad, float diff, bool useSSE) {
744 /*
745   int i;
746   float rsum, gsum, bsum,  msum;
747   CASM_FPIXEL sigma1, sigma2, sigma3, desigma;
748   TPixel32 *pix1, *pix2, *pix3, *pix4;
749 
750   BLUR_CODE(0, unsigned char)
751 */
752 
753 #ifdef USE_SSE2
754   if (useSSE)
755     blur_code_SSE2<T, P>(row1, row2, length, coeff, coeffq, brad, diff, 0);
756   else
757 #endif
758     blur_code<T, BlurPixel<P>, P>(row1, row2, length, coeff, coeffq, brad, diff,
759                                   0);
760 }
761 
762 //-------------------------------------------------------------------
763 template <class T, class Q, class P>
doBlurRgb(TRasterPT<T> & dstRas,TRasterPT<T> & srcRas,double blur,int dx,int dy,bool useSSE)764 void doBlurRgb(TRasterPT<T> &dstRas, TRasterPT<T> &srcRas, double blur, int dx,
765                int dy, bool useSSE) {
766   int i, lx, ly, llx, lly, brad;
767   float coeff, coeffq, diff;
768   int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
769 
770   brad = (int)ceil(blur); /* number of pixels involved in the filtering */
771 
772   // int border = brad*2; // per sicurezza
773 
774   coeff = (float)(blur /
775                   (brad - brad * brad +
776                    blur * (2 * brad -
777                            1))); /*sum of the weights of triangolar filter. */
778   coeffq = (float)(coeff / blur);
779   diff   = (float)(blur - brad);
780   lx     = srcRas->getLx();
781   ly     = srcRas->getLy();
782 
783   if ((lx == 0) || (ly == 0)) return;
784 
785   llx = lx + bx1 + bx2;
786   lly = ly + by1 + by2;
787 
788   T *row1, *col2, *buffer;
789   BlurPixel<P> *row2, *col1, *fbuffer;
790   TRasterGR8P r1;
791 
792 #ifdef _WIN32
793   if (useSSE) {
794     fbuffer =
795         (BlurPixel<P> *)_aligned_malloc(llx * ly * sizeof(BlurPixel<P>), 16);
796     row1 = (T *)_aligned_malloc((llx + 2 * brad) * sizeof(T), 16);
797     col1 = (BlurPixel<P> *)_aligned_malloc(
798         (lly + 2 * brad) * sizeof(BlurPixel<P>), 16);
799     col2 = (T *)_aligned_malloc(lly * sizeof(T), 16);
800   } else
801 #endif
802   {
803     TRasterGR8P raux(llx * sizeof(BlurPixel<P>), ly);
804     r1 = raux;
805     r1->lock();
806     fbuffer = (BlurPixel<P> *)r1->getRawData();  // new CASM_FPIXEL [llx *ly];
807     row1    = new T[llx + 2 * brad];
808     col1    = new BlurPixel<P>[ lly + 2 * brad ];
809     col2    = new T[lly];
810   }
811 
812   if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
813     if (!useSSE) r1->unlock();
814 #ifdef _WIN32
815     if (useSSE) {
816       _aligned_free(col2);
817       _aligned_free(col1);
818       _aligned_free(row1);
819       _aligned_free(fbuffer);
820     } else
821 #endif
822     {
823       delete[] col2;
824       delete[] col1;
825       delete[] row1;
826     }
827     return;
828   }
829 
830   row2 = fbuffer;
831 
832   try {
833     for (i = 0; i < ly; i++) {
834       load_rowRgb<T>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);
835       do_filtering_floatRgb<T>(row1 + brad, row2, llx, coeff, coeffq, brad,
836                                diff, useSSE);
837       row2 += llx;
838     }
839     dstRas->lock();
840     buffer = (T *)dstRas->getRawData();
841 
842     if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
843 
844     for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx);
845          i++) {
846       load_colRgb<P>(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);
847       do_filtering_chan<T, Q, P>(col1 + brad, col2, lly, coeff, coeffq, brad,
848                                  diff, useSSE);
849       store_colRgb<T>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,
850                       i + dx, dy, 0, blur);
851     }
852     dstRas->unlock();
853   } catch (...) {
854     dstRas->clear();
855   }
856 
857 #ifdef _WIN32
858   if (useSSE) {
859     _aligned_free(col2);
860     _aligned_free(col1);
861     _aligned_free(row1);
862     _aligned_free(fbuffer);
863   } else
864 #endif
865   {
866     delete[] col2;
867     delete[] col1;
868     delete[] row1;
869     r1->unlock();
870   }
871 }
872 
873 //-------------------------------------------------------------------
874 
875 template <class T>
doBlurGray(TRasterPT<T> & dstRas,TRasterPT<T> & srcRas,double blur,int dx,int dy)876 void doBlurGray(TRasterPT<T> &dstRas, TRasterPT<T> &srcRas, double blur, int dx,
877                 int dy) {
878   int i, lx, ly, llx, lly, brad;
879   float coeff, coeffq, diff;
880   int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
881 
882   brad  = (int)ceil(blur); /* number of pixels involved in the filtering */
883   coeff = (float)(blur /
884                   (brad - brad * brad +
885                    blur * (2 * brad -
886                            1))); /*sum of the weights of triangolar filter. */
887   coeffq = (float)(coeff / blur);
888   diff   = (float)(blur - brad);
889   lx     = srcRas->getLx();
890   ly     = srcRas->getLy();
891 
892   if ((lx == 0) || (ly == 0)) return;
893 
894   llx = lx + bx1 + bx2;
895   lly = ly + by1 + by2;
896 
897   T *row1, *col2, *buffer;
898   float *row2, *col1, *fbuffer;
899 
900   TRasterGR8P r1(llx * sizeof(float), ly);
901   r1->lock();
902   fbuffer = (float *)r1->getRawData();  // new float[llx *ly];
903 
904   row1 = new T[llx + 2 * brad];
905   col1 = new float[lly + 2 * brad];
906   col2 = new T[lly];
907 
908   if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
909     delete[] row1;
910     delete[] col1;
911     delete[] col2;
912     return;
913   }
914 
915   row2 = fbuffer;
916 
917   for (i = 0; i < ly; i++) {
918     load_rowGray<T>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);
919     do_filtering_channel_float<T>(row1 + brad, row2, llx, coeff, coeffq, brad,
920                                   diff);
921     row2 += llx;
922   }
923   dstRas->lock();
924   buffer = (T *)dstRas->getRawData();
925 
926   if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
927 
928   for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx); i++) {
929     load_channel_col32(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);
930     do_filtering_channel_gray<T>(col1 + brad, col2, lly, coeff, coeffq, brad,
931                                  diff);
932 
933     int backlit = 0;
934     store_colGray<T>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,
935                      i + dx, dy, backlit, blur);
936   }
937   dstRas->unlock();
938   delete[] col2;
939   delete[] col1;
940   delete[] row1;
941   r1->unlock();  // delete[]fbuffer;
942 }
943 
944 };  // namespace
945 
946 //====================================================================
947 
getBlurBorder(double blur)948 int TRop::getBlurBorder(double blur) {
949   int brad = (int)ceil(blur); /* number of pixels involved in the filtering */
950 
951   int border = brad * 2;  // per sicurezza
952   return border;
953 }
954 
955 //--------------------------------------------------------------------
956 
blur(const TRasterP & dstRas,const TRasterP & srcRas,double blur,int dx,int dy,bool useSSE)957 void TRop::blur(const TRasterP &dstRas, const TRasterP &srcRas, double blur,
958                 int dx, int dy, bool useSSE) {
959   TRaster32P dstRas32 = dstRas;
960   TRaster32P srcRas32 = srcRas;
961 
962   if (dstRas32 && srcRas32)
963     doBlurRgb<TPixel32, UCHAR, float>(dstRas32, srcRas32, blur, dx, dy, useSSE);
964   else {
965     TRaster64P dstRas64 = dstRas;
966     TRaster64P srcRas64 = srcRas;
967     if (dstRas64 && srcRas64)
968       doBlurRgb<TPixel64, USHORT, double>(dstRas64, srcRas64, blur, dx, dy,
969                                           useSSE);
970     else {
971       TRasterGR8P dstRasGR8 = dstRas;
972       TRasterGR8P srcRasGR8 = srcRas;
973 
974       if (dstRasGR8 && srcRasGR8)
975         doBlurGray<TPixelGR8>(dstRasGR8, srcRasGR8, blur, dx, dy);
976       else {
977         TRasterGR16P dstRasGR16 = dstRas;
978         TRasterGR16P srcRasGR16 = srcRas;
979 
980         if (dstRasGR16 && srcRasGR16)
981           doBlurGray<TPixelGR16>(dstRasGR16, srcRasGR16, blur, dx, dy);
982         else
983           throw TException("TRop::blur unsupported pixel type");
984       }
985     }
986   }
987 }
988