1
2
3 #include "traster.h"
4 #include "trop.h"
5 #include "tpixelgr.h"
6
7 #if defined(_WIN32) && defined(x64)
8 #define USE_SSE2
9 #endif
10
11 #ifdef USE_SSE2
12 #include <emmintrin.h>
13 #include <stdlib.h>
14 #endif
15
16
17 namespace {
18
19 #ifdef _WIN32
20 template <class T>
21 struct BlurPixel {
22 T b;
23 T g;
24 T r;
25 T m;
26 };
27
28 #else
29 template <class T>
30 struct BlurPixel {
31 T r;
32 T g;
33 T b;
34 T m;
35 };
36
37 #endif
38
39 //===================================================================
40
41 #define LOAD_COL_CODE \
42 \
43 buffer += x; \
44 pix = col + by1; \
45 \
46 for (i = by1; i < ly + by1; i++) { \
47 *pix++ = *buffer; \
48 buffer += lx; \
49 } \
50 \
51 pix += by2; \
52 left_val = col[0]; \
53 right_val = *(pix - 1); \
54 col--; \
55 \
56 for (i = 0; i < brad; i++) { \
57 *col-- = left_val; \
58 *pix++ = right_val; \
59 }
60
61 //-------------------------------------------------------------------
62
63 #define BLUR_CODE(round_fac, channel_type) \
64 pix1 = row1; \
65 pix2 = row1 - 1; \
66 \
67 sigma1.r = pix1->r; \
68 sigma1.g = pix1->g; \
69 sigma1.b = pix1->b; \
70 sigma1.m = pix1->m; \
71 pix1++; \
72 \
73 sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0; \
74 sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0; \
75 \
76 for (i = 1; i < brad; i++) { \
77 sigma1.r += pix1->r; \
78 sigma1.g += pix1->g; \
79 sigma1.b += pix1->b; \
80 sigma1.m += pix1->m; \
81 \
82 sigma2.r += pix2->r; \
83 sigma2.g += pix2->g; \
84 sigma2.b += pix2->b; \
85 sigma2.m += pix2->m; \
86 \
87 sigma3.r += i * (pix1->r + pix2->r); \
88 sigma3.g += i * (pix1->g + pix2->g); \
89 sigma3.b += i * (pix1->b + pix2->b); \
90 sigma3.m += i * (pix1->m + pix2->m); \
91 \
92 pix1++; \
93 pix2--; \
94 } \
95 \
96 rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac); \
97 gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac); \
98 bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac); \
99 msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac); \
100 \
101 row2->r = (channel_type)(rsum); \
102 row2->g = (channel_type)(gsum); \
103 row2->b = (channel_type)(bsum); \
104 row2->m = (channel_type)(msum); \
105 row2++; \
106 \
107 sigma2.r += row1[-brad].r; \
108 sigma2.g += row1[-brad].g; \
109 sigma2.b += row1[-brad].b; \
110 sigma2.m += row1[-brad].m; \
111 \
112 pix1 = row1 + brad; \
113 pix2 = row1; \
114 pix3 = row1 - brad; \
115 pix4 = row1 - brad + 1; \
116 \
117 desigma.r = sigma1.r - sigma2.r; \
118 desigma.g = sigma1.g - sigma2.g; \
119 desigma.b = sigma1.b - sigma2.b; \
120 desigma.m = sigma1.m - sigma2.m; \
121 \
122 for (i = 1; i < length; i++) { \
123 desigma.r += pix1->r - 2 * pix2->r + pix3->r; \
124 desigma.g += pix1->g - 2 * pix2->g + pix3->g; \
125 desigma.b += pix1->b - 2 * pix2->b + pix3->b; \
126 desigma.m += pix1->m - 2 * pix2->m + pix3->m; \
127 \
128 rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq; \
129 gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq; \
130 bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq; \
131 msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq; \
132 \
133 row2->r = (channel_type)(rsum); \
134 row2->g = (channel_type)(gsum); \
135 row2->b = (channel_type)(bsum); \
136 row2->m = (channel_type)(msum); \
137 row2++; \
138 pix1++, pix2++, pix3++, pix4++; \
139 }
140
141 //-------------------------------------------------------------------
142
143 template <typename PIXEL_SRC, typename PIXEL_DST, typename T>
blur_code(PIXEL_SRC * row1,PIXEL_DST * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)144 inline void blur_code(PIXEL_SRC *row1, PIXEL_DST *row2, int length, float coeff,
145 float coeffq, int brad, float diff, float round_fac) {
146 int i;
147 T rsum, gsum, bsum, msum;
148
149 BlurPixel<T> sigma1, sigma2, sigma3, desigma;
150 PIXEL_SRC *pix1, *pix2, *pix3, *pix4;
151
152 pix1 = row1;
153 pix2 = row1 - 1;
154
155 sigma1.r = pix1->r;
156 sigma1.g = pix1->g;
157 sigma1.b = pix1->b;
158 sigma1.m = pix1->m;
159 pix1++;
160
161 sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0;
162 sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0;
163
164 for (i = 1; i < brad; i++) {
165 sigma1.r += pix1->r;
166 sigma1.g += pix1->g;
167 sigma1.b += pix1->b;
168 sigma1.m += pix1->m;
169
170 sigma2.r += pix2->r;
171 sigma2.g += pix2->g;
172 sigma2.b += pix2->b;
173 sigma2.m += pix2->m;
174
175 sigma3.r += i * (pix1->r + pix2->r);
176 sigma3.g += i * (pix1->g + pix2->g);
177 sigma3.b += i * (pix1->b + pix2->b);
178 sigma3.m += i * (pix1->m + pix2->m);
179
180 pix1++;
181 pix2--;
182 }
183
184 rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac);
185 gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac);
186 bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac);
187 msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac);
188
189 row2->r = rsum;
190 row2->g = gsum;
191 row2->b = bsum;
192 row2->m = msum;
193 row2++;
194
195 sigma2.r += row1[-brad].r;
196 sigma2.g += row1[-brad].g;
197 sigma2.b += row1[-brad].b;
198 sigma2.m += row1[-brad].m;
199
200 pix1 = row1 + brad;
201 pix2 = row1;
202 pix3 = row1 - brad;
203 pix4 = row1 - brad + 1;
204
205 desigma.r = sigma1.r - sigma2.r;
206 desigma.g = sigma1.g - sigma2.g;
207 desigma.b = sigma1.b - sigma2.b;
208 desigma.m = sigma1.m - sigma2.m;
209
210 for (i = 1; i < length; i++) {
211 desigma.r += pix1->r - 2 * pix2->r + pix3->r;
212 desigma.g += pix1->g - 2 * pix2->g + pix3->g;
213 desigma.b += pix1->b - 2 * pix2->b + pix3->b;
214 desigma.m += pix1->m - 2 * pix2->m + pix3->m;
215
216 rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq;
217 gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq;
218 bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq;
219 msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq;
220
221 row2->r = rsum;
222 row2->g = gsum;
223 row2->b = bsum;
224 row2->m = msum;
225 row2++;
226 pix1++, pix2++, pix3++, pix4++;
227 }
228 }
229
230 //-------------------------------------------------------------------
231
232 #ifdef USE_SSE2
233
234 //-------------------------------------------------------------------
235 template <class T, class P>
blur_code_SSE2(T * row1,BlurPixel<P> * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)236 inline void blur_code_SSE2(T *row1, BlurPixel<P> *row2, int length, float coeff,
237 float coeffq, int brad, float diff,
238 float round_fac) {
239 static float two = 2;
240 static __m128i zeros = _mm_setzero_si128();
241 static __m128 twos = _mm_load_ps1(&two);
242
243 int i;
244
245 __m128 sigma1, sigma2, sigma3, desigma;
246 T *pix1, *pix2, *pix3, *pix4;
247
248 pix1 = row1;
249 pix2 = row1 - 1;
250
251 //
252 __m128i piPix1 = _mm_cvtsi32_si128(*(DWORD *)pix1);
253 __m128i piPix2 = _mm_cvtsi32_si128(*(DWORD *)pix2);
254
255 piPix1 = _mm_unpacklo_epi8(piPix1, zeros);
256 piPix2 = _mm_unpacklo_epi8(piPix2, zeros);
257 piPix1 = _mm_unpacklo_epi16(piPix1, zeros);
258 piPix2 = _mm_unpacklo_epi16(piPix2, zeros);
259
260 sigma1 = _mm_cvtepi32_ps(piPix1);
261 //
262
263 pix1++;
264
265 float zero = 0;
266 sigma2 = _mm_load1_ps(&zero);
267 sigma3 = _mm_load1_ps(&zero);
268
269 for (i = 1; i < brad; i++) {
270 piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
271 piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
272
273 __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
274 __m128 pPix2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix2, zeros));
275
276 sigma1 = _mm_add_ps(sigma1, pPix1);
277 sigma2 = _mm_add_ps(sigma2, pPix2);
278
279 __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
280 __m128 pi = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
281
282 pPix1 = _mm_add_ps(pPix1, pPix2);
283 pPix1 = _mm_mul_ps(pi, pPix1); // i*(pix1 + pix2)
284 sigma3 = _mm_add_ps(sigma3, pPix1); // sigma3 += i*(pix1 + pix2)
285
286 pix1++;
287 pix2--;
288 }
289
290 __m128 pCoeff = _mm_load1_ps(&coeff);
291 __m128 pCoeffq = _mm_load1_ps(&coeffq);
292 __m128 pRoundFac = _mm_load1_ps(&round_fac);
293 __m128 pDiff = _mm_load1_ps(&diff);
294
295 // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
296 __m128 sum = _mm_add_ps(sigma1, sigma2);
297 sum = _mm_mul_ps(sum, pCoeff);
298 __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
299 sum2 = _mm_add_ps(sum2, pRoundFac);
300 sum = _mm_sub_ps(sum, sum2);
301 /*
302 __m128i isum = _mm_cvtps_epi32(sum);
303 isum = _mm_packs_epi32(isum, zeros);
304 isum = _mm_packs_epi16(isum, zeros);
305
306 *(DWORD*)row2 = _mm_cvtsi128_si32(isum);
307 */
308 _mm_store_ps((float *)row2, sum);
309 row2++;
310
311 __m128i piPixMin =
312 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)(row1 - brad)), zeros);
313 __m128 pPixMin = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPixMin, zeros));
314 sigma2 = _mm_add_ps(sigma2, pPixMin);
315 /*
316 sigma2.r += row1[-brad].r;
317 sigma2.g += row1[-brad].g;
318 sigma2.b += row1[-brad].b;
319 sigma2.m += row1[-brad].m;
320 */
321
322 pix1 = row1 + brad;
323 pix2 = row1;
324 pix3 = row1 - brad;
325 pix4 = row1 - brad + 1;
326
327 desigma = _mm_sub_ps(sigma1, sigma2);
328
329 for (i = 1; i < length; i++) {
330 piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
331 piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
332 __m128i piPix3 =
333 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix3), zeros);
334 __m128i piPix4 =
335 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix4), zeros);
336
337 __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
338 __m128 pPix2 =
339 _mm_cvtepi32_ps(_mm_slli_epi32(_mm_unpacklo_epi16(piPix2, zeros), 1));
340 __m128 pPix3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix3, zeros));
341 __m128 pPix4 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix4, zeros));
342
343 // desigma += pix1 - 2*pix2 + pix3
344 __m128 tmp = _mm_sub_ps(pPix3, pPix2);
345 tmp = _mm_add_ps(tmp, pPix1);
346 desigma = _mm_add_ps(desigma, tmp);
347
348 // sum += (desigma + diff*(pix1 - pix4))*coeffq
349 tmp = _mm_sub_ps(pPix1, pPix4);
350 tmp = _mm_mul_ps(tmp, pDiff);
351 tmp = _mm_add_ps(desigma, tmp);
352 tmp = _mm_mul_ps(tmp, pCoeffq);
353 sum = _mm_add_ps(sum, tmp);
354 /*
355 isum = _mm_cvtps_epi32(sum);
356 isum = _mm_packs_epi32(isum, zeros);
357 isum = _mm_packs_epi16(isum, zeros);
358
359 *(DWORD*)row2 = _mm_cvtsi128_si32(isum);
360 */
361 _mm_store_ps((float *)row2, sum);
362
363 row2++;
364 pix1++, pix2++, pix3++, pix4++;
365 }
366 }
367
368 //-------------------------------------------------------------------
369 template <class T, class P>
blur_code_SSE2(BlurPixel<P> * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff,float round_fac)370 inline void blur_code_SSE2(BlurPixel<P> *row1, T *row2, int length, float coeff,
371 float coeffq, int brad, float diff,
372 float round_fac) {
373 int i;
374
375 float two = 2;
376 __m128i zeros = _mm_setzero_si128();
377 __m128 twos = _mm_load_ps1(&two);
378
379 __m128 sigma1, sigma2, sigma3, desigma;
380 BlurPixel<P> *pix1, *pix2, *pix3, *pix4;
381
382 pix1 = row1;
383 pix2 = row1 - 1;
384
385 __m128 pPix1 = _mm_load_ps((float *)pix1);
386 __m128 pPix2 = _mm_load_ps((float *)pix2);
387
388 // sigma1 = *pix1
389 sigma1 = pPix1;
390
391 pix1++;
392
393 float zero = 0;
394 sigma2 = _mm_load1_ps(&zero);
395 sigma3 = _mm_load1_ps(&zero);
396
397 for (i = 1; i < brad; i++) {
398 pPix1 = _mm_load_ps((float *)pix1);
399 pPix2 = _mm_load_ps((float *)pix2);
400
401 sigma1 = _mm_add_ps(sigma1, pPix1);
402 sigma2 = _mm_add_ps(sigma2, pPix2);
403
404 __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
405 __m128 pi = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
406
407 pPix1 = _mm_add_ps(pPix1, pPix2);
408 pPix1 = _mm_mul_ps(pi, pPix1); // i*(pix1 + pix2)
409 sigma3 = _mm_add_ps(sigma3, pPix1); // sigma3 += i*(pix1 + pix2)
410
411 pix1++;
412 pix2--;
413 }
414
415 __m128 pCoeff = _mm_load1_ps(&coeff);
416 __m128 pCoeffq = _mm_load1_ps(&coeffq);
417 // __m128 pRoundFac = _mm_load1_ps(&round_fac);
418 __m128 pDiff = _mm_load1_ps(&diff);
419
420 // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
421 __m128 sum = _mm_add_ps(sigma1, sigma2);
422 sum = _mm_mul_ps(sum, pCoeff);
423 __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
424 // sum2 = _mm_add_ps(sum2, pRoundFac);
425 sum = _mm_sub_ps(sum, sum2);
426
427 // converte i canali da float a char
428 __m128i isum = _mm_cvtps_epi32(sum);
429 isum = _mm_packs_epi32(isum, zeros);
430 // isum = _mm_packs_epi16(isum, zeros);
431 isum = _mm_packus_epi16(isum, zeros);
432 *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
433
434 row2++;
435
436 // sigma2 += row1[-brad]
437 __m128 pPixMin = _mm_load_ps((float *)(row1 - brad));
438 sigma2 = _mm_add_ps(sigma2, pPixMin);
439
440 pix1 = row1 + brad;
441 pix2 = row1;
442 pix3 = row1 - brad;
443 pix4 = row1 - brad + 1;
444
445 desigma = _mm_sub_ps(sigma1, sigma2);
446
447 for (i = 1; i < length; i++) {
448 __m128 pPix1 = _mm_load_ps((float *)pix1);
449 __m128 pPix2 = _mm_load_ps((float *)pix2);
450 __m128 pPix3 = _mm_load_ps((float *)pix3);
451 __m128 pPix4 = _mm_load_ps((float *)pix4);
452
453 pPix2 = _mm_mul_ps(pPix2, twos);
454
455 // desigma += pix1 - 2*pix2 + pix3
456 __m128 tmp = _mm_sub_ps(pPix3, pPix2);
457 tmp = _mm_add_ps(tmp, pPix1);
458 desigma = _mm_add_ps(desigma, tmp);
459
460 // sum += (desigma + diff*(pix1 - pix4))*coeffq
461 tmp = _mm_sub_ps(pPix1, pPix4);
462 tmp = _mm_mul_ps(tmp, pDiff);
463 tmp = _mm_add_ps(desigma, tmp);
464 tmp = _mm_mul_ps(tmp, pCoeffq);
465 sum = _mm_add_ps(sum, tmp);
466
467 // converte i canali da float a char
468 __m128i isum = _mm_cvtps_epi32(sum);
469 isum = _mm_packs_epi32(isum, zeros);
470 // isum = _mm_packs_epi16(isum, zeros); // QUESTA RIGA E' SBAGLIATA
471 // assert(false);
472 isum = _mm_packus_epi16(isum, zeros);
473 *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
474
475 row2++;
476 pix1++, pix2++, pix3++, pix4++;
477 }
478 }
479
480 #endif // _WIN32
481
482 //-------------------------------------------------------------------
483
484 #define STORE_COL_CODE(crop_val) \
485 { \
486 int i, val; \
487 double ampl; \
488 buffer += x; \
489 \
490 ampl = 1.0 + blur / 15.0; \
491 \
492 if (backlit) \
493 for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) { \
494 val = troundp(col[i].r * ampl); \
495 buffer->r = (val > crop_val) ? crop_val : val; \
496 val = troundp(col[i].g * ampl); \
497 buffer->g = (val > crop_val) ? crop_val : val; \
498 val = troundp(col[i].b * ampl); \
499 buffer->b = (val > crop_val) ? crop_val : val; \
500 val = troundp(col[i].m * ampl); \
501 buffer->m = (val > crop_val) ? crop_val : val; \
502 buffer += wrap; \
503 } \
504 else \
505 for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) { \
506 *buffer = col[i]; \
507 buffer += wrap; \
508 } \
509 }
510
511 //-------------------------------------------------------------------
512 template <class T>
store_colRgb(T * buffer,int wrap,int r_ly,T * col,int ly,int x,int dy,int backlit,double blur)513 void store_colRgb(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
514 int backlit, double blur) {
515 int val = T::maxChannelValue;
516
517 if (val == 255)
518 STORE_COL_CODE(204)
519 else if (val == 65535)
520 STORE_COL_CODE(204 * 257)
521 else
522 assert(false);
523 }
524
525 //-------------------------------------------------------------------
526 template <class T>
store_colGray(T * buffer,int wrap,int r_ly,T * col,int ly,int x,int dy,int backlit,double blur)527 void store_colGray(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
528 int backlit, double blur) {
529 int i;
530 double ampl;
531 buffer += x;
532
533 ampl = 1.0 + blur / 15.0;
534
535 for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {
536 *buffer = col[i];
537 buffer += wrap;
538 }
539 }
540
541 //-------------------------------------------------------------------
542 template <class P>
load_colRgb(BlurPixel<P> * buffer,BlurPixel<P> * col,int lx,int ly,int x,int brad,int by1,int by2)543 void load_colRgb(BlurPixel<P> *buffer, BlurPixel<P> *col, int lx, int ly, int x,
544 int brad, int by1, int by2) {
545 int i;
546 BlurPixel<P> *pix, left_val, right_val;
547
548 LOAD_COL_CODE
549 }
550
551 //-------------------------------------------------------------------
552
load_channel_col32(float * buffer,float * col,int lx,int ly,int x,int brad,int by1,int by2)553 void load_channel_col32(float *buffer, float *col, int lx, int ly, int x,
554 int brad, int by1, int by2) {
555 int i;
556 float *pix, left_val, right_val;
557
558 LOAD_COL_CODE
559 }
560
561 //-------------------------------------------------------------------
562 template <class T, class Q, class P>
do_filtering_chan(BlurPixel<P> * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff,bool useSSE)563 void do_filtering_chan(BlurPixel<P> *row1, T *row2, int length, float coeff,
564 float coeffq, int brad, float diff, bool useSSE) {
565 #ifdef USE_SSE2
566 if (useSSE && T::maxChannelValue == 255)
567 blur_code_SSE2<T, P>(row1, row2, length, coeff, coeffq, brad, diff, 0.5);
568 else
569 #endif
570 {
571 int i;
572 P rsum, gsum, bsum, msum;
573 BlurPixel<P> sigma1, sigma2, sigma3, desigma;
574 BlurPixel<P> *pix1, *pix2, *pix3, *pix4;
575
576 BLUR_CODE((P)0.5, Q)
577 }
578 }
579
580 //-------------------------------------------------------------------
581
582 template <class T>
do_filtering_channel_float(T * row1,float * row2,int length,float coeff,float coeffq,int brad,float diff)583 void do_filtering_channel_float(T *row1, float *row2, int length, float coeff,
584 float coeffq, int brad, float diff) {
585 int i;
586 float sum;
587 float sigma1, sigma2, sigma3, desigma;
588 T *pix1, *pix2, *pix3, *pix4;
589
590 pix1 = row1;
591 pix2 = row1 - 1;
592
593 sigma1 = pix1->value;
594 pix1++;
595
596 sigma2 = 0.0;
597 sigma3 = 0.0;
598
599 for (i = 1; i < brad; i++) {
600 sigma1 += pix1->value;
601 sigma2 += pix2->value;
602 sigma3 += i * (pix1->value + pix2->value);
603 pix1++;
604 pix2--;
605 }
606
607 sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq;
608
609 *row2 = sum;
610 row2++;
611
612 sigma2 += row1[-brad].value;
613
614 pix1 = row1 + brad;
615 pix2 = row1;
616 pix3 = row1 - brad;
617 pix4 = row1 - brad + 1;
618
619 desigma = sigma1 - sigma2;
620
621 for (i = 1; i < length; i++) {
622 desigma += pix1->value - 2 * pix2->value + pix3->value;
623
624 sum += (desigma + diff * (pix1->value - pix4->value)) * coeffq;
625
626 *row2 = sum;
627 row2++;
628 pix1++, pix2++, pix3++, pix4++;
629 }
630 }
631
632 //-------------------------------------------------------------------
633 template <class T>
do_filtering_channel_gray(float * row1,T * row2,int length,float coeff,float coeffq,int brad,float diff)634 void do_filtering_channel_gray(float *row1, T *row2, int length, float coeff,
635 float coeffq, int brad, float diff) {
636 int i;
637 float sum;
638 float sigma1, sigma2, sigma3, desigma;
639 float *pix1, *pix2, *pix3, *pix4;
640
641 pix1 = row1;
642 pix2 = row1 - 1;
643
644 sigma1 = *pix1;
645 pix1++;
646
647 sigma2 = 0.0;
648 sigma3 = 0.0;
649
650 for (i = 1; i < brad; i++) {
651 sigma1 += *pix1;
652 sigma2 += *pix2;
653 sigma3 += i * (*pix1 + *pix2);
654 pix1++;
655 pix2--;
656 }
657
658 sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq + 0.5F;
659
660 row2->setValue((int)sum);
661 row2++;
662
663 sigma2 += row1[-brad];
664
665 pix1 = row1 + brad;
666 pix2 = row1;
667 pix3 = row1 - brad;
668 pix4 = row1 - brad + 1;
669
670 desigma = sigma1 - sigma2;
671
672 for (i = 1; i < length; i++) {
673 desigma += *pix1 - 2 * (*pix2) + (*pix3);
674
675 sum += (desigma + diff * (*pix1 - *pix4)) * coeffq;
676
677 row2->setValue((int)sum);
678 row2++;
679 pix1++, pix2++, pix3++, pix4++;
680 }
681 }
682
683 //-------------------------------------------------------------------
684 template <class T>
load_rowRgb(TRasterPT<T> & rin,T * row,int lx,int y,int brad,int bx1,int bx2)685 void load_rowRgb(TRasterPT<T> &rin, T *row, int lx, int y, int brad, int bx1,
686 int bx2) {
687 int i;
688 T *buf32, *pix;
689 T left_val, right_val;
690
691 pix = row + bx1;
692
693 {
694 rin->lock();
695 buf32 = rin->pixels(y);
696
697 for (i = 0; i < lx; i++) *pix++ = *buf32++;
698 rin->unlock();
699 }
700
701 pix += bx2;
702 left_val = *row;
703 right_val = *(pix - 1);
704 row--;
705
706 for (i = 0; i < brad;
707 i++) /* pixels equal to the ones of border of image are added */
708 { /* to avoid a black blur to get into the picture. */
709 *row-- = left_val;
710 *pix++ = right_val;
711 }
712 }
713
714 //-------------------------------------------------------------------
715 template <class T>
load_rowGray(TRasterPT<T> & rin,T * row,int lx,int y,int brad,int bx1,int bx2)716 void load_rowGray(TRasterPT<T> &rin, T *row, int lx, int y, int brad, int bx1,
717 int bx2) {
718 int i;
719 T *buf8, *pix;
720 T left_val, right_val;
721
722 pix = row + bx1;
723 buf8 = (T *)(rin->pixels(y));
724
725 for (i = 0; i < lx; i++) *pix++ = *buf8++;
726
727 pix += bx2;
728 left_val = *row;
729 right_val = *(pix - 1);
730 row--;
731
732 for (i = 0; i < brad;
733 i++) /* pixels equal to the ones of border of image are added */
734 { /* to avoid a black blur to get into the picture. */
735 *row-- = left_val;
736 *pix++ = right_val;
737 }
738 }
739
740 //-------------------------------------------------------------------
741 template <class T, class P>
do_filtering_floatRgb(T * row1,BlurPixel<P> * row2,int length,float coeff,float coeffq,int brad,float diff,bool useSSE)742 void do_filtering_floatRgb(T *row1, BlurPixel<P> *row2, int length, float coeff,
743 float coeffq, int brad, float diff, bool useSSE) {
744 /*
745 int i;
746 float rsum, gsum, bsum, msum;
747 CASM_FPIXEL sigma1, sigma2, sigma3, desigma;
748 TPixel32 *pix1, *pix2, *pix3, *pix4;
749
750 BLUR_CODE(0, unsigned char)
751 */
752
753 #ifdef USE_SSE2
754 if (useSSE)
755 blur_code_SSE2<T, P>(row1, row2, length, coeff, coeffq, brad, diff, 0);
756 else
757 #endif
758 blur_code<T, BlurPixel<P>, P>(row1, row2, length, coeff, coeffq, brad, diff,
759 0);
760 }
761
762 //-------------------------------------------------------------------
763 template <class T, class Q, class P>
doBlurRgb(TRasterPT<T> & dstRas,TRasterPT<T> & srcRas,double blur,int dx,int dy,bool useSSE)764 void doBlurRgb(TRasterPT<T> &dstRas, TRasterPT<T> &srcRas, double blur, int dx,
765 int dy, bool useSSE) {
766 int i, lx, ly, llx, lly, brad;
767 float coeff, coeffq, diff;
768 int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
769
770 brad = (int)ceil(blur); /* number of pixels involved in the filtering */
771
772 // int border = brad*2; // per sicurezza
773
774 coeff = (float)(blur /
775 (brad - brad * brad +
776 blur * (2 * brad -
777 1))); /*sum of the weights of triangolar filter. */
778 coeffq = (float)(coeff / blur);
779 diff = (float)(blur - brad);
780 lx = srcRas->getLx();
781 ly = srcRas->getLy();
782
783 if ((lx == 0) || (ly == 0)) return;
784
785 llx = lx + bx1 + bx2;
786 lly = ly + by1 + by2;
787
788 T *row1, *col2, *buffer;
789 BlurPixel<P> *row2, *col1, *fbuffer;
790 TRasterGR8P r1;
791
792 #ifdef _WIN32
793 if (useSSE) {
794 fbuffer =
795 (BlurPixel<P> *)_aligned_malloc(llx * ly * sizeof(BlurPixel<P>), 16);
796 row1 = (T *)_aligned_malloc((llx + 2 * brad) * sizeof(T), 16);
797 col1 = (BlurPixel<P> *)_aligned_malloc(
798 (lly + 2 * brad) * sizeof(BlurPixel<P>), 16);
799 col2 = (T *)_aligned_malloc(lly * sizeof(T), 16);
800 } else
801 #endif
802 {
803 TRasterGR8P raux(llx * sizeof(BlurPixel<P>), ly);
804 r1 = raux;
805 r1->lock();
806 fbuffer = (BlurPixel<P> *)r1->getRawData(); // new CASM_FPIXEL [llx *ly];
807 row1 = new T[llx + 2 * brad];
808 col1 = new BlurPixel<P>[ lly + 2 * brad ];
809 col2 = new T[lly];
810 }
811
812 if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
813 if (!useSSE) r1->unlock();
814 #ifdef _WIN32
815 if (useSSE) {
816 _aligned_free(col2);
817 _aligned_free(col1);
818 _aligned_free(row1);
819 _aligned_free(fbuffer);
820 } else
821 #endif
822 {
823 delete[] col2;
824 delete[] col1;
825 delete[] row1;
826 }
827 return;
828 }
829
830 row2 = fbuffer;
831
832 try {
833 for (i = 0; i < ly; i++) {
834 load_rowRgb<T>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);
835 do_filtering_floatRgb<T>(row1 + brad, row2, llx, coeff, coeffq, brad,
836 diff, useSSE);
837 row2 += llx;
838 }
839 dstRas->lock();
840 buffer = (T *)dstRas->getRawData();
841
842 if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
843
844 for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx);
845 i++) {
846 load_colRgb<P>(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);
847 do_filtering_chan<T, Q, P>(col1 + brad, col2, lly, coeff, coeffq, brad,
848 diff, useSSE);
849 store_colRgb<T>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,
850 i + dx, dy, 0, blur);
851 }
852 dstRas->unlock();
853 } catch (...) {
854 dstRas->clear();
855 }
856
857 #ifdef _WIN32
858 if (useSSE) {
859 _aligned_free(col2);
860 _aligned_free(col1);
861 _aligned_free(row1);
862 _aligned_free(fbuffer);
863 } else
864 #endif
865 {
866 delete[] col2;
867 delete[] col1;
868 delete[] row1;
869 r1->unlock();
870 }
871 }
872
873 //-------------------------------------------------------------------
874
875 template <class T>
doBlurGray(TRasterPT<T> & dstRas,TRasterPT<T> & srcRas,double blur,int dx,int dy)876 void doBlurGray(TRasterPT<T> &dstRas, TRasterPT<T> &srcRas, double blur, int dx,
877 int dy) {
878 int i, lx, ly, llx, lly, brad;
879 float coeff, coeffq, diff;
880 int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
881
882 brad = (int)ceil(blur); /* number of pixels involved in the filtering */
883 coeff = (float)(blur /
884 (brad - brad * brad +
885 blur * (2 * brad -
886 1))); /*sum of the weights of triangolar filter. */
887 coeffq = (float)(coeff / blur);
888 diff = (float)(blur - brad);
889 lx = srcRas->getLx();
890 ly = srcRas->getLy();
891
892 if ((lx == 0) || (ly == 0)) return;
893
894 llx = lx + bx1 + bx2;
895 lly = ly + by1 + by2;
896
897 T *row1, *col2, *buffer;
898 float *row2, *col1, *fbuffer;
899
900 TRasterGR8P r1(llx * sizeof(float), ly);
901 r1->lock();
902 fbuffer = (float *)r1->getRawData(); // new float[llx *ly];
903
904 row1 = new T[llx + 2 * brad];
905 col1 = new float[lly + 2 * brad];
906 col2 = new T[lly];
907
908 if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
909 delete[] row1;
910 delete[] col1;
911 delete[] col2;
912 return;
913 }
914
915 row2 = fbuffer;
916
917 for (i = 0; i < ly; i++) {
918 load_rowGray<T>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);
919 do_filtering_channel_float<T>(row1 + brad, row2, llx, coeff, coeffq, brad,
920 diff);
921 row2 += llx;
922 }
923 dstRas->lock();
924 buffer = (T *)dstRas->getRawData();
925
926 if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
927
928 for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx); i++) {
929 load_channel_col32(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);
930 do_filtering_channel_gray<T>(col1 + brad, col2, lly, coeff, coeffq, brad,
931 diff);
932
933 int backlit = 0;
934 store_colGray<T>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,
935 i + dx, dy, backlit, blur);
936 }
937 dstRas->unlock();
938 delete[] col2;
939 delete[] col1;
940 delete[] row1;
941 r1->unlock(); // delete[]fbuffer;
942 }
943
944 }; // namespace
945
946 //====================================================================
947
getBlurBorder(double blur)948 int TRop::getBlurBorder(double blur) {
949 int brad = (int)ceil(blur); /* number of pixels involved in the filtering */
950
951 int border = brad * 2; // per sicurezza
952 return border;
953 }
954
955 //--------------------------------------------------------------------
956
blur(const TRasterP & dstRas,const TRasterP & srcRas,double blur,int dx,int dy,bool useSSE)957 void TRop::blur(const TRasterP &dstRas, const TRasterP &srcRas, double blur,
958 int dx, int dy, bool useSSE) {
959 TRaster32P dstRas32 = dstRas;
960 TRaster32P srcRas32 = srcRas;
961
962 if (dstRas32 && srcRas32)
963 doBlurRgb<TPixel32, UCHAR, float>(dstRas32, srcRas32, blur, dx, dy, useSSE);
964 else {
965 TRaster64P dstRas64 = dstRas;
966 TRaster64P srcRas64 = srcRas;
967 if (dstRas64 && srcRas64)
968 doBlurRgb<TPixel64, USHORT, double>(dstRas64, srcRas64, blur, dx, dy,
969 useSSE);
970 else {
971 TRasterGR8P dstRasGR8 = dstRas;
972 TRasterGR8P srcRasGR8 = srcRas;
973
974 if (dstRasGR8 && srcRasGR8)
975 doBlurGray<TPixelGR8>(dstRasGR8, srcRasGR8, blur, dx, dy);
976 else {
977 TRasterGR16P dstRasGR16 = dstRas;
978 TRasterGR16P srcRasGR16 = srcRas;
979
980 if (dstRasGR16 && srcRasGR16)
981 doBlurGray<TPixelGR16>(dstRasGR16, srcRasGR16, blur, dx, dy);
982 else
983 throw TException("TRop::blur unsupported pixel type");
984 }
985 }
986 }
987 }
988