1 /* convi
2 *
3 * Copyright: 1990, N. Dessipris.
4 *
5 * Author: Nicos Dessipris & Kirk Martinez
6 * Written on: 29/04/1991
7 * Modified on: 19/05/1991
8 * 8/7/93 JC
9 * - adapted for partial v2
10 * - memory leaks fixed
11 * - ANSIfied
12 * 23/7/93 JC
13 * - inner loop unrolled with a switch - 25% speed-up!
14 * 13/12/93 JC
15 * - tiny rounding error removed
16 * 7/10/94 JC
17 * - new IM_ARRAY() macro
18 * - various simplifications
19 * - evalend callback added
20 * 1/2/95 JC
21 * - use of IM_REGION_ADDR() updated
22 * - output size was incorrect! see comment below
23 * - bug with large non-square matricies fixed too
24 * - uses new im_embed() function
25 * 13/7/98 JC
26 * - wierd bug ... im_free_imask is no longer directly called for close
27 * callback, caused SIGKILL on solaris 2.6 ... linker bug?
28 * 9/3/01 JC
29 * - reworked and simplified, about 10% faster
30 * - slightly better range clipping
31 * 27/7/01 JC
32 * - reject masks with scale == 0
33 * 7/4/04
34 * - im_conv() now uses im_embed() with edge stretching on the input, not
35 * the output
36 * - sets Xoffset / Yoffset
37 * 11/11/05
38 * - simpler inner loop avoids gcc4 bug
39 * 7/11/07
40 * - new evalstart/end callbacks
41 * 12/5/08
42 * - int rounding was +1 too much, argh
43 * - only rebuild the buffer offsets if bpl changes
44 * 5/4/09
45 * - tiny speedups and cleanups
46 * - add restrict, though it doesn't seem to help gcc
47 * 12/11/09
48 * - only check for non-zero elements once
49 * - add mask-all-zero check
50 * - cleanups
51 * 3/2/10
52 * - gtkdoc
53 * - more cleanups
54 * 23/08/10
55 * - add a special case for 3x3 masks, about 20% faster
56 * 1/10/10
57 * - support complex (just double the bands)
58 * 18/10/10
59 * - add experimental Orc path
60 * 29/10/10
61 * - use VipsVector
62 * - get rid of im_convsep(), just call this twice, no longer worth
63 * keeping two versions
64 * 8/11/10
65 * - add array tiling
66 * 9/5/11
67 * - argh typo in overflow estimation could cause errors
68 * 15/10/11 Nicolas
69 * - handle offset correctly in seperable convolutions
70 * 26/1/16 Lovell Fuller
71 * - remove Duff for a 25% speedup
72 * 23/6/16
73 * - rewritten as a class
74 * - new fixed-point vector path, up to 2x faster
75 * 2/7/17
76 * - remove pts for a small speedup
77 * 12/10/17
78 * - fix leak of vectors, thanks MHeimbuc
79 * 14/10/17
80 * - switch to half-float for vector path
81 */
82
83 /*
84
85 This file is part of VIPS.
86
87 VIPS is free software; you can redistribute it and/or modify
88 it under the terms of the GNU Lesser General Public License as published by
89 the Free Software Foundation; either version 2 of the License, or
90 (at your option) any later version.
91
92 This program is distributed in the hope that it will be useful,
93 but WITHOUT ANY WARRANTY; without even the implied warranty of
94 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
95 GNU Lesser General Public License for more details.
96
97 You should have received a copy of the GNU Lesser General Public License
98 along with this program; if not, write to the Free Software
99 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
100 02110-1301 USA
101
102 */
103
104 /*
105
106 These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
107
108 */
109
110 /*
111 #define DEBUG
112 #define DEBUG_PIXELS
113 #define DEBUG_COMPILE
114 */
115
116 #ifdef HAVE_CONFIG_H
117 #include <config.h>
118 #endif /*HAVE_CONFIG_H*/
119 #include <vips/intl.h>
120
121 #include <stdio.h>
122 #include <stdlib.h>
123 #include <limits.h>
124
125 #include <vips/vips.h>
126 #include <vips/internal.h>
127
128 #include "pconvolution.h"
129
130 /* Larger than this and we fall back to C.
131 */
132 #define MAX_PASS (20)
133
134 /* A pass with a vector.
135 */
136 typedef struct {
137 int first; /* The index of the first mask coff we use */
138 int last; /* The index of the last mask coff we use */
139
140 int r; /* Set previous result in this var */
141
142 /* The code we generate for this section of the mask.
143 */
144 VipsVector *vector;
145 } Pass;
146
147 typedef struct {
148 VipsConvolution parent_instance;
149
150 int n_point; /* w * h for our matrix */
151
152 /* We make a smaller version of the mask with the zeros squeezed out.
153 */
154 int nnz; /* Number of non-zero mask elements */
155 int *coeff; /* Array of non-zero mask coefficients */
156 int *coeff_pos; /* Index of each nnz element in mask->coeff */
157
158 /* And a half float version for the vector path. mant has the signed
159 * 8-bit mantissas in [-1, +1), sexp has the exponent shift after the
160 * mul and before the add, and exp has the final exponent shift before
161 * write-back.
162 */
163 int *mant;
164 int sexp;
165 int exp;
166
167 /* The set of passes we need for this mask.
168 */
169 int n_pass;
170 Pass pass[MAX_PASS];
171
172 /* Code for the final clip back to 8 bits.
173 */
174 int r;
175 VipsVector *vector;
176 } VipsConvi;
177
178 typedef VipsConvolutionClass VipsConviClass;
179
180 G_DEFINE_TYPE( VipsConvi, vips_convi, VIPS_TYPE_CONVOLUTION );
181
182 /* Our sequence value.
183 */
184 typedef struct {
185 VipsConvi *convi;
186 VipsRegion *ir; /* Input region */
187
188 int *offsets; /* Offsets for each non-zero matrix element */
189
190 int last_bpl; /* Avoid recalcing offsets, if we can */
191
192 /* We need a pair of intermediate buffers to keep the results of each
193 * vector conv pass.
194 */
195 short *t1;
196 short *t2;
197 } VipsConviSequence;
198
199 static void
vips_convi_compile_free(VipsConvi * convi)200 vips_convi_compile_free( VipsConvi *convi )
201 {
202 int i;
203
204 for( i = 0; i < convi->n_pass; i++ )
205 VIPS_FREEF( vips_vector_free, convi->pass[i].vector );
206 convi->n_pass = 0;
207 VIPS_FREEF( vips_vector_free, convi->vector );
208 }
209
210 static void
vips_convi_dispose(GObject * gobject)211 vips_convi_dispose( GObject *gobject )
212 {
213 VipsConvi *convi = (VipsConvi *) gobject;
214
215 #ifdef DEBUG
216 printf( "vips_convi_dispose: " );
217 vips_object_print_name( VIPS_OBJECT( gobject ) );
218 printf( "\n" );
219 #endif /*DEBUG*/
220
221 vips_convi_compile_free( convi );
222
223 G_OBJECT_CLASS( vips_convi_parent_class )->dispose( gobject );
224 }
225
226 /* Free a sequence value.
227 */
228 static int
vips_convi_stop(void * vseq,void * a,void * b)229 vips_convi_stop( void *vseq, void *a, void *b )
230 {
231 VipsConviSequence *seq = (VipsConviSequence *) vseq;
232
233 VIPS_UNREF( seq->ir );
234 VIPS_FREE( seq->offsets );
235 VIPS_FREE( seq->t1 );
236 VIPS_FREE( seq->t2 );
237
238 return( 0 );
239 }
240
241 /* Convolution start function.
242 */
243 static void *
vips_convi_start(VipsImage * out,void * a,void * b)244 vips_convi_start( VipsImage *out, void *a, void *b )
245 {
246 VipsImage *in = (VipsImage *) a;
247 VipsConvi *convi = (VipsConvi *) b;
248 VipsConviSequence *seq;
249
250 if( !(seq = VIPS_NEW( out, VipsConviSequence )) )
251 return( NULL );
252
253 seq->convi = convi;
254 seq->ir = NULL;
255 seq->offsets = NULL;
256 seq->last_bpl = -1;
257 seq->t1 = NULL;
258 seq->t2 = NULL;
259
260 seq->ir = vips_region_new( in );
261
262 /* C mode.
263 */
264 if( convi->nnz ) {
265 if( !(seq->offsets = VIPS_ARRAY( NULL, convi->nnz, int )) ) {
266 vips_convi_stop( seq, in, convi );
267 return( NULL );
268 }
269 }
270
271 /* Vector mode.
272 */
273 if( convi->n_pass ) {
274 seq->t1 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
275 seq->t2 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
276
277 if( !seq->t1 ||
278 !seq->t2 ) {
279 vips_convi_stop( seq, in, convi );
280 return( NULL );
281 }
282 }
283
284 return( (void *) seq );
285 }
286
287 #define TEMP( N, S ) vips_vector_temporary( v, (char *) N, S )
288 #define PARAM( N, S ) vips_vector_parameter( v, (char *) N, S )
289 #define SCANLINE( N, P, S ) vips_vector_source_scanline( v, (char *) N, P, S )
290 #define CONST( N, V, S ) vips_vector_constant( v, (char *) N, V, S )
291 #define ASM2( OP, A, B ) vips_vector_asm2( v, (char *) OP, A, B )
292 #define ASM3( OP, A, B, C ) vips_vector_asm3( v, (char *) OP, A, B, C )
293
294 /* Generate code for a section of the mask. first is the index we start
295 * at, we set last to the index of the last one we use before we run
296 * out of intermediates / constants / parameters / sources or mask
297 * coefficients.
298 *
299 * 0 for success, -1 on error.
300 */
301 static int
vips_convi_compile_section(VipsConvi * convi,VipsImage * in,Pass * pass)302 vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
303 {
304 VipsConvolution *convolution = (VipsConvolution *) convi;
305 VipsImage *M = convolution->M;
306
307 VipsVector *v;
308 int i;
309
310 #ifdef DEBUG_COMPILE
311 printf( "starting pass %d\n", pass->first );
312 #endif /*DEBUG_COMPILE*/
313
314 pass->vector = v = vips_vector_new( "convi", 2 );
315
316 /* "r" is the array of sums from the previous pass (if any).
317 */
318 pass->r = vips_vector_source_name( v, "r", 2 );
319
320 /* The value we fetch from the image, the accumulated sum.
321 */
322 TEMP( "value", 2 );
323 TEMP( "valueb", 1 );
324 TEMP( "sum", 2 );
325
326 /* Init the sum. If this is the first pass, it's a constant. If this
327 * is a later pass, we have to init the sum from the result
328 * of the previous pass.
329 */
330 if( pass->first == 0 ) {
331 char c0[256];
332
333 CONST( c0, 0, 2 );
334 ASM2( "loadpw", "sum", c0 );
335 }
336 else
337 ASM2( "loadw", "sum", "r" );
338
339 for( i = pass->first; i < convi->n_point; i++ ) {
340 int x = i % M->Xsize;
341 int y = i / M->Xsize;
342
343 char source[256];
344 char off[256];
345 char rnd[256];
346 char sexp[256];
347 char coeff[256];
348
349 /* Exclude zero elements.
350 */
351 if( !convi->mant[i] )
352 continue;
353
354 /* The source. sl0 is the first scanline in the mask.
355 */
356 SCANLINE( source, y, 1 );
357
358 /* Load with an offset. Only for non-first-columns though.
359 */
360 if( x == 0 )
361 ASM2( "convubw", "value", source );
362 else {
363 CONST( off, in->Bands * x, 1 );
364 ASM3( "loadoffb", "valueb", source, off );
365 ASM2( "convubw", "value", "valueb" );
366 }
367
368 /* We need a signed multiply, so the image pixel needs to
369 * become a signed 16-bit value. We know only the bottom 8 bits
370 * of the image and coefficient are interesting, so we can take
371 * the bottom half of a 16x16->32 multiply.
372 */
373 CONST( coeff, convi->mant[i], 2 );
374 ASM3( "mullw", "value", "value", coeff );
375
376 /* Shift right before add to prevent overflow on large masks.
377 */
378 CONST( sexp, convi->sexp, 2 );
379 CONST( rnd, 1 << (convi->sexp - 1), 2 );
380 ASM3( "addw", "value", "value", rnd );
381 ASM3( "shrsw", "value", "value", sexp );
382
383 /* We accumulate the signed 16-bit result in sum. Saturated
384 * add.
385 */
386 ASM3( "addssw", "sum", "sum", "value" );
387
388 if( vips_vector_full( v ) )
389 break;
390 }
391
392 pass->last = i;
393
394 /* And write to our intermediate buffer.
395 */
396 ASM2( "copyw", "d1", "sum" );
397
398 if( !vips_vector_compile( v ) )
399 return( -1 );
400
401 #ifdef DEBUG_COMPILE
402 printf( "done coeffs %d to %d\n", pass->first, pass->last );
403 vips_vector_print( v );
404 #endif /*DEBUG_COMPILE*/
405
406 return( 0 );
407 }
408
409 /* Generate code for the final 16->8 conversion.
410 *
411 * 0 for success, -1 on error.
412 */
413 static int
vips_convi_compile_clip(VipsConvi * convi)414 vips_convi_compile_clip( VipsConvi *convi )
415 {
416 VipsConvolution *convolution = (VipsConvolution *) convi;
417 VipsImage *M = convolution->M;
418 int offset = VIPS_RINT( vips_image_get_offset( M ) );
419
420 VipsVector *v;
421 char rnd[256];
422 char exp[256];
423 char c0[256];
424 char c255[256];
425 char off[256];
426
427 convi->vector = v = vips_vector_new( "convi", 1 );
428
429 /* "r" is the array of sums we clip down.
430 */
431 convi->r = vips_vector_source_name( v, "r", 2 );
432
433 /* The value we fetch from the image.
434 */
435 TEMP( "value", 2 );
436
437 CONST( rnd, 1 << (convi->exp - 1), 2 );
438 ASM3( "addw", "value", "r", rnd );
439 CONST( exp, convi->exp, 2 );
440 ASM3( "shrsw", "value", "value", exp );
441
442 CONST( off, offset, 2 );
443 ASM3( "addw", "value", "value", off );
444
445 /* You'd think "convsuswb" (convert signed 16-bit to unsigned
446 * 8-bit with saturation) would be quicker, but it's a lot
447 * slower.
448 */
449 CONST( c0, 0, 2 );
450 ASM3( "maxsw", "value", c0, "value" );
451 CONST( c255, 255, 2 );
452 ASM3( "minsw", "value", c255, "value" );
453
454 ASM2( "convwb", "d1", "value" );
455
456 if( !vips_vector_compile( v ) )
457 return( -1 );
458
459 return( 0 );
460 }
461
462 static int
vips_convi_compile(VipsConvi * convi,VipsImage * in)463 vips_convi_compile( VipsConvi *convi, VipsImage *in )
464 {
465 int i;
466 Pass *pass;
467
468 /* Generate passes until we've used up the whole mask.
469 */
470 for( i = 0;; ) {
471 /* Allocate space for another pass.
472 */
473 if( convi->n_pass == MAX_PASS )
474 return( -1 );
475 pass = &convi->pass[convi->n_pass];
476 convi->n_pass += 1;
477
478 pass->first = i;
479 pass->r = -1;
480
481 if( vips_convi_compile_section( convi, in, pass ) )
482 return( -1 );
483 i = pass->last + 1;
484
485 if( i >= convi->n_point )
486 break;
487 }
488
489 if( vips_convi_compile_clip( convi ) )
490 return( -1 );
491
492 return( 0 );
493 }
494
495 static int
vips_convi_gen_vector(VipsRegion * or,void * vseq,void * a,void * b,gboolean * stop)496 vips_convi_gen_vector( VipsRegion *or,
497 void *vseq, void *a, void *b, gboolean *stop )
498 {
499 VipsConviSequence *seq = (VipsConviSequence *) vseq;
500 VipsConvi *convi = (VipsConvi *) b;
501 VipsConvolution *convolution = (VipsConvolution *) convi;
502 VipsImage *M = convolution->M;
503 VipsImage *in = (VipsImage *) a;
504 VipsRegion *ir = seq->ir;
505 VipsRect *r = &or->valid;
506 int ne = r->width * in->Bands;
507
508 VipsRect s;
509 int i, y;
510 VipsExecutor executor[MAX_PASS];
511 VipsExecutor clip;
512
513 #ifdef DEBUG_PIXELS
514 printf( "vips_convi_gen_vector: generating %d x %d at %d x %d\n",
515 r->width, r->height, r->left, r->top );
516 #endif /*DEBUG_PIXELS*/
517
518 /* Prepare the section of the input image we need. A little larger
519 * than the section of the output image we are producing.
520 */
521 s = *r;
522 s.width += M->Xsize - 1;
523 s.height += M->Ysize - 1;
524 if( vips_region_prepare( ir, &s ) )
525 return( -1 );
526
527 for( i = 0; i < convi->n_pass; i++ )
528 vips_executor_set_program( &executor[i],
529 convi->pass[i].vector, ne );
530 vips_executor_set_program( &clip, convi->vector, ne );
531
532 VIPS_GATE_START( "vips_convi_gen_vector: work" );
533
534 for( y = 0; y < r->height; y ++ ) {
535 VipsPel *q = VIPS_REGION_ADDR( or, r->left, r->top + y );
536
537 #ifdef DEBUG_PIXELS
538 {
539 int h, v;
540
541 printf( "before convolve: x = %d, y = %d\n",
542 r->left, r->top + y );
543 for( v = 0; v < M->Ysize; v++ ) {
544 for( h = 0; h < M->Xsize; h++ )
545 printf( "%3d ", *VIPS_REGION_ADDR( ir,
546 r->left + h, r->top + y + v ) );
547 printf( "\n" );
548 }
549 }
550 #endif /*DEBUG_PIXELS*/
551
552 /* We run our n passes to generate this scanline.
553 */
554 for( i = 0; i < convi->n_pass; i++ ) {
555 Pass *pass = &convi->pass[i];
556
557 vips_executor_set_scanline( &executor[i],
558 ir, r->left, r->top + y );
559 vips_executor_set_array( &executor[i],
560 pass->r, seq->t1 );
561 vips_executor_set_destination( &executor[i], seq->t2 );
562 vips_executor_run( &executor[i] );
563
564 VIPS_SWAP( signed short *, seq->t1, seq->t2 );
565 }
566
567 #ifdef DEBUG_PIXELS
568 printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
569 #endif /*DEBUG_PIXELS*/
570
571 vips_executor_set_array( &clip, convi->r, seq->t1 );
572 vips_executor_set_destination( &clip, q );
573 vips_executor_run( &clip );
574
575 #ifdef DEBUG_PIXELS
576 printf( "after clip: %d\n",
577 *VIPS_REGION_ADDR( or, r->left, r->top + y ) );
578 #endif /*DEBUG_PIXELS*/
579 }
580
581 VIPS_GATE_STOP( "vips_convi_gen_vector: work" );
582
583 VIPS_COUNT_PIXELS( or, "vips_convi_gen_vector" );
584
585 return( 0 );
586 }
587
588 /* INT inner loops.
589 */
590 #define CONV_INT( TYPE, CLIP ) { \
591 TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
592 TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
593 int * restrict offsets = seq->offsets; \
594 \
595 for( x = 0; x < sz; x++ ) { \
596 int sum; \
597 int i; \
598 \
599 sum = 0; \
600 for ( i = 0; i < nnz; i++ ) \
601 sum += t[i] * p[offsets[i]]; \
602 \
603 sum = ((sum + rounding) / scale) + offset; \
604 \
605 CLIP; \
606 \
607 q[x] = sum; \
608 p += 1; \
609 } \
610 }
611
612 /* FLOAT inner loops.
613 */
614 #define CONV_FLOAT( TYPE ) { \
615 TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
616 TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
617 int * restrict offsets = seq->offsets; \
618 \
619 for( x = 0; x < sz; x++ ) { \
620 double sum; \
621 int i; \
622 \
623 sum = 0; \
624 for ( i = 0; i < nnz; i++ ) \
625 sum += t[i] * p[offsets[i]]; \
626 \
627 sum = (sum / scale) + offset; \
628 \
629 q[x] = sum; \
630 p += 1; \
631 } \
632 }
633
634 /* Various integer range clips. Record over/under flows.
635 */
636 #define CLIP_UCHAR( V ) \
637 G_STMT_START { \
638 if( (V) < 0 ) \
639 (V) = 0; \
640 else if( (V) > UCHAR_MAX ) \
641 (V) = UCHAR_MAX; \
642 } G_STMT_END
643
644 #define CLIP_CHAR( V ) \
645 G_STMT_START { \
646 if( (V) < SCHAR_MIN ) \
647 (V) = SCHAR_MIN; \
648 else if( (V) > SCHAR_MAX ) \
649 (V) = SCHAR_MAX; \
650 } G_STMT_END
651
652 #define CLIP_USHORT( V ) \
653 G_STMT_START { \
654 if( (V) < 0 ) \
655 (V) = 0; \
656 else if( (V) > USHRT_MAX ) \
657 (V) = USHRT_MAX; \
658 } G_STMT_END
659
660 #define CLIP_SHORT( V ) \
661 G_STMT_START { \
662 if( (V) < SHRT_MIN ) \
663 (V) = SHRT_MIN; \
664 else if( (V) > SHRT_MAX ) \
665 (V) = SHRT_MAX; \
666 } G_STMT_END
667
668 #define CLIP_NONE( V ) {}
669
670 /* Convolve!
671 */
672 static int
vips_convi_gen(VipsRegion * or,void * vseq,void * a,void * b,gboolean * stop)673 vips_convi_gen( VipsRegion *or,
674 void *vseq, void *a, void *b, gboolean *stop )
675 {
676 VipsConviSequence *seq = (VipsConviSequence *) vseq;
677 VipsConvi *convi = (VipsConvi *) b;
678 VipsConvolution *convolution = (VipsConvolution *) convi;
679 VipsImage *M = convolution->M;
680 int scale = VIPS_RINT( vips_image_get_scale( M ) );
681 int rounding = scale / 2;
682 int offset = VIPS_RINT( vips_image_get_offset( M ) );
683 VipsImage *in = (VipsImage *) a;
684 VipsRegion *ir = seq->ir;
685 int * restrict t = convi->coeff;
686 const int nnz = convi->nnz;
687 VipsRect *r = &or->valid;
688 int le = r->left;
689 int to = r->top;
690 int bo = VIPS_RECT_BOTTOM( r );
691 int sz = VIPS_REGION_N_ELEMENTS( or ) *
692 (vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);
693
694 VipsRect s;
695 int x, y, z, i;
696
697 /* Prepare the section of the input image we need. A little larger
698 * than the section of the output image we are producing.
699 */
700 s = *r;
701 s.width += M->Xsize - 1;
702 s.height += M->Ysize - 1;
703 if( vips_region_prepare( ir, &s ) )
704 return( -1 );
705
706 /* Fill offset array. Only do this if the bpl has changed since the
707 * previous vips_region_prepare().
708 */
709 if( seq->last_bpl != VIPS_REGION_LSKIP( ir ) ) {
710 seq->last_bpl = VIPS_REGION_LSKIP( ir );
711
712 for( i = 0; i < nnz; i++ ) {
713 z = convi->coeff_pos[i];
714 x = z % M->Xsize;
715 y = z / M->Xsize;
716
717 seq->offsets[i] =
718 (VIPS_REGION_ADDR( ir, x + le, y + to ) -
719 VIPS_REGION_ADDR( ir, le, to )) /
720 VIPS_IMAGE_SIZEOF_ELEMENT( ir->im );
721 }
722 }
723
724 VIPS_GATE_START( "vips_convi_gen: work" );
725
726 for( y = to; y < bo; y++ ) {
727 switch( in->BandFmt ) {
728 case VIPS_FORMAT_UCHAR:
729 CONV_INT( unsigned char, CLIP_UCHAR( sum ) );
730 break;
731
732 case VIPS_FORMAT_CHAR:
733 CONV_INT( signed char, CLIP_CHAR( sum ) );
734 break;
735
736 case VIPS_FORMAT_USHORT:
737 CONV_INT( unsigned short, CLIP_USHORT( sum ) );
738 break;
739
740 case VIPS_FORMAT_SHORT:
741 CONV_INT( signed short, CLIP_SHORT( sum ) );
742 break;
743
744 case VIPS_FORMAT_UINT:
745 CONV_INT( unsigned int, CLIP_NONE( sum ) );
746 break;
747
748 case VIPS_FORMAT_INT:
749 CONV_INT( signed int, CLIP_NONE( sum ) );
750 break;
751
752 case VIPS_FORMAT_FLOAT:
753 case VIPS_FORMAT_COMPLEX:
754 CONV_FLOAT( float );
755 break;
756
757 case VIPS_FORMAT_DOUBLE:
758 case VIPS_FORMAT_DPCOMPLEX:
759 CONV_FLOAT( double );
760 break;
761
762 default:
763 g_assert_not_reached();
764 }
765 }
766
767 VIPS_GATE_STOP( "vips_convi_gen: work" );
768
769 VIPS_COUNT_PIXELS( or, "vips_convi_gen" );
770
771 return( 0 );
772 }
773
774 /* Make an int version of a mask.
775 *
776 * We rint() everything, then adjust the scale try to match the overall
777 * effect.
778 */
779 int
vips__image_intize(VipsImage * in,VipsImage ** out)780 vips__image_intize( VipsImage *in, VipsImage **out )
781 {
782 VipsImage *t;
783 int x, y;
784 double double_result;
785 double out_scale;
786 double out_offset;
787 int int_result;
788
789 if( vips_check_matrix( "vips2imask", in, &t ) )
790 return( -1 );
791 if( !(*out = vips_image_new_matrix( t->Xsize, t->Ysize )) ) {
792 g_object_unref( t );
793 return( -1 );
794 }
795
796 /* We want to make an intmask which has the same input to output ratio
797 * as the double image.
798 *
799 * Imagine convolving with the double image, what's the ratio of
800 * brightness between input and output? We want the same ratio for the
801 * int version, if we can.
802 *
803 * Imagine an input image where every pixel is 1, what will the output
804 * be?
805 */
806 double_result = 0;
807 for( y = 0; y < t->Ysize; y++ )
808 for( x = 0; x < t->Xsize; x++ )
809 double_result += *VIPS_MATRIX( t, x, y );
810 double_result /= vips_image_get_scale( t );
811
812 for( y = 0; y < t->Ysize; y++ )
813 for( x = 0; x < t->Xsize; x++ )
814 *VIPS_MATRIX( *out, x, y ) =
815 VIPS_RINT( *VIPS_MATRIX( t, x, y ) );
816
817 out_scale = VIPS_RINT( vips_image_get_scale( t ) );
818 if( out_scale == 0 )
819 out_scale = 1;
820 out_offset = VIPS_RINT( vips_image_get_offset( t ) );
821
822 /* Now convolve a 1 everywhere image with the int version we've made,
823 * what do we get?
824 */
825 int_result = 0;
826 for( y = 0; y < t->Ysize; y++ )
827 for( x = 0; x < t->Xsize; x++ )
828 int_result += *VIPS_MATRIX( *out, x, y );
829 int_result /= out_scale;
830
831 /* And adjust the scale to get as close to a match as we can.
832 */
833 out_scale = VIPS_RINT( out_scale + (int_result - double_result) );
834 if( out_scale == 0 )
835 out_scale = 1;
836
837 vips_image_set_double( *out, "scale", out_scale );
838 vips_image_set_double( *out, "offset", out_offset );
839
840 g_object_unref( t );
841
842 return( 0 );
843 }
844
845 /* Make an int version of a mask. Each element is 8.8 float, with the same
846 * exponent for each element (so just 8 bits in @out).
847 *
848 * @out is a w x h int array.
849 */
850 static int
vips_convi_intize(VipsConvi * convi,VipsImage * M)851 vips_convi_intize( VipsConvi *convi, VipsImage *M )
852 {
853 VipsImage *t;
854 double scale;
855 double *scaled;
856 double mx;
857 double mn;
858 int shift;
859 int i;
860
861 if( vips_check_matrix( "vips2imask", M, &t ) )
862 return( -1 );
863
864 /* Bake the scale into the mask to make a double version.
865 */
866 scale = vips_image_get_scale( t );
867 if( !(scaled = VIPS_ARRAY( convi, convi->n_point, double )) ) {
868 g_object_unref( t );
869 return( -1 );
870 }
871 for( i = 0; i < convi->n_point; i++ )
872 scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale;
873 g_object_unref( t );
874
875 #ifdef DEBUG_COMPILE
876 {
877 int x, y;
878
879 printf( "vips_convi_intize: double version\n" );
880 for( y = 0; y < t->Ysize; y++ ) {
881 printf( "\t" );
882 for( x = 0; x < t->Xsize; x++ )
883 printf( "%g ", scaled[y * t->Xsize + x] );
884 printf( "\n" );
885 }
886 }
887 #endif /*DEBUG_COMPILE*/
888
889 mx = scaled[0];
890 mn = scaled[0];
891 for( i = 1; i < convi->n_point; i++ ) {
892 if( scaled[i] > mx )
893 mx = scaled[i];
894 if( scaled[i] < mn )
895 mn = scaled[i];
896 }
897
898 /* The mask max rounded up to the next power of two gives the exponent
899 * all elements share. Values are eg. -3 for 1/8, 3 for 8.
900 *
901 * Add one so we round up stuff exactly on x.0. We multiply by 128
902 * later, so 1.0 (for example) would become 128, which is outside
903 * signed 8 bit.
904 */
905 shift = ceil( log2( mx ) + 1 );
906
907 /* We need to sum n_points, so we have to shift right before adding a
908 * new value to make sure we have enough range.
909 */
910 convi->sexp = ceil( log2( convi->n_point ) );
911 if( convi->sexp > 10 ) {
912 g_info( "vips_convi_intize: mask too large" );
913 return( -1 );
914 }
915
916 /* With that already done, the final shift must be ...
917 */
918 convi->exp = 7 - shift - convi->sexp;
919
920 if( !(convi->mant = VIPS_ARRAY( convi, convi->n_point, int )) )
921 return( -1 );
922 for( i = 0; i < convi->n_point; i++ ) {
923 /* 128 since this is signed.
924 */
925 convi->mant[i] = VIPS_RINT( 128 * scaled[i] * pow(2, -shift) );
926
927 if( convi->mant[i] < -128 ||
928 convi->mant[i] > 127 ) {
929 g_info( "vips_convi_intize: mask range too large" );
930 return( -1 );
931 }
932 }
933
934 #ifdef DEBUG_COMPILE
935 {
936 int x, y;
937
938 printf( "vips_convi_intize:\n" );
939 printf( "sexp = %d\n", convi->sexp );
940 printf( "exp = %d\n", convi->exp );
941 for( y = 0; y < t->Ysize; y++ ) {
942 printf( "\t" );
943 for( x = 0; x < t->Xsize; x++ )
944 printf( "%4d ", convi->mant[y * t->Xsize + x] );
945 printf( "\n" );
946 }
947 }
948 #endif /*DEBUG_COMPILE*/
949
950 /* Verify accuracy.
951 */
952 {
953 double true_sum;
954 int int_sum;
955 int true_value;
956 int int_value;
957
958 true_sum = 0.0;
959 int_sum = 0;
960 for( i = 0; i < convi->n_point; i++ ) {
961 int value;
962
963 true_sum += 128 * scaled[i];
964 value = 128 * convi->mant[i];
965 value = (value + (1 << (convi->sexp - 1))) >> convi->sexp;
966 int_sum += value;
967 int_sum = VIPS_CLIP( SHRT_MIN, int_sum, SHRT_MAX );
968 }
969
970 true_value = VIPS_CLIP( 0, true_sum, 255 );
971
972 if( convi->exp > 0 )
973 int_value = (int_sum + (1 << (convi->exp - 1))) >> convi->exp;
974 else
975 int_value = VIPS_LSHIFT_INT( int_sum, convi->exp );
976 int_value = VIPS_CLIP( 0, int_value, 255 );
977
978 if( VIPS_ABS( true_value - int_value ) > 2 ) {
979 g_info( "vips_convi_intize: too inaccurate" );
980 return( -1 );
981 }
982 }
983
984 return( 0 );
985 }
986
987 static int
vips_convi_build(VipsObject * object)988 vips_convi_build( VipsObject *object )
989 {
990 VipsConvolution *convolution = (VipsConvolution *) object;
991 VipsConvi *convi = (VipsConvi *) object;
992 VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
993
994 VipsImage *in;
995 VipsImage *M;
996 VipsGenerateFn generate;
997 double *coeff;
998 int i;
999
1000 if( VIPS_OBJECT_CLASS( vips_convi_parent_class )->build( object ) )
1001 return( -1 );
1002
1003 in = convolution->in;
1004 M = convolution->M;
1005 convi->n_point = M->Xsize * M->Ysize;
1006
1007 if( vips_embed( in, &t[0],
1008 M->Xsize / 2, M->Ysize / 2,
1009 in->Xsize + M->Xsize - 1, in->Ysize + M->Ysize - 1,
1010 "extend", VIPS_EXTEND_COPY,
1011 NULL ) )
1012 return( -1 );
1013 in = t[0];
1014
1015 /* Default to the C path.
1016 */
1017 generate = vips_convi_gen;
1018
1019 /* For uchar input, try to make a vector path.
1020 */
1021 if( vips_vector_isenabled() &&
1022 in->BandFmt == VIPS_FORMAT_UCHAR ) {
1023 if( !vips_convi_intize( convi, M ) &&
1024 !vips_convi_compile( convi, in ) ) {
1025 generate = vips_convi_gen_vector;
1026 g_info( "convi: using vector path" );
1027 }
1028 else
1029 vips_convi_compile_free( convi );
1030 }
1031
1032 /* Make the data for the C path.
1033 */
1034 if( generate == vips_convi_gen ) {
1035 g_info( "convi: using C path" );
1036
1037 /* Make an int version of our mask.
1038 */
1039 if( vips__image_intize( M, &t[1] ) )
1040 return( -1 );
1041 M = t[1];
1042
1043 coeff = VIPS_MATRIX( M, 0, 0 );
1044 if( !(convi->coeff = VIPS_ARRAY( object, convi->n_point, int )) ||
1045 !(convi->coeff_pos =
1046 VIPS_ARRAY( object, convi->n_point, int )) )
1047 return( -1 );
1048
1049 /* Squeeze out zero mask elements.
1050 */
1051 convi->nnz = 0;
1052 for( i = 0; i < convi->n_point; i++ )
1053 if( coeff[i] ) {
1054 convi->coeff[convi->nnz] = coeff[i];
1055 convi->coeff_pos[convi->nnz] = i;
1056 convi->nnz += 1;
1057 }
1058
1059 /* Was the whole mask zero? We must have at least 1 element
1060 * in there: set it to zero.
1061 */
1062 if( convi->nnz == 0 ) {
1063 convi->coeff[0] = 0;
1064 convi->coeff_pos[0] = 0;
1065 convi->nnz = 1;
1066 }
1067 }
1068
1069 g_object_set( convi, "out", vips_image_new(), NULL );
1070 if( vips_image_pipelinev( convolution->out,
1071 VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
1072 return( -1 );
1073
1074 /* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
1075 * would be 1x1.
1076 */
1077 convolution->out->Xsize -= M->Xsize - 1;
1078 convolution->out->Ysize -= M->Ysize - 1;
1079
1080 if( vips_image_generate( convolution->out,
1081 vips_convi_start, generate, vips_convi_stop, in, convi ) )
1082 return( -1 );
1083
1084 convolution->out->Xoffset = -M->Xsize / 2;
1085 convolution->out->Yoffset = -M->Ysize / 2;
1086
1087 return( 0 );
1088 }
1089
1090 static void
vips_convi_class_init(VipsConviClass * class)1091 vips_convi_class_init( VipsConviClass *class )
1092 {
1093 GObjectClass *gobject_class = G_OBJECT_CLASS( class );
1094 VipsObjectClass *object_class = (VipsObjectClass *) class;
1095
1096 gobject_class->dispose = vips_convi_dispose;
1097
1098 object_class->nickname = "convi";
1099 object_class->description = _( "int convolution operation" );
1100 object_class->build = vips_convi_build;
1101 }
1102
1103 static void
vips_convi_init(VipsConvi * convi)1104 vips_convi_init( VipsConvi *convi )
1105 {
1106 convi->nnz = 0;
1107 convi->coeff = NULL;
1108 convi->coeff_pos = NULL;
1109 }
1110
1111 /**
1112 * vips_convi: (method)
1113 * @in: input image
1114 * @out: (out): output image
1115 * @mask: convolve with this mask
1116 * @...: %NULL-terminated list of optional named arguments
1117 *
1118 * Integer convolution. This is a low-level operation, see vips_conv() for
1119 * something more convenient.
1120 *
1121 * @mask is converted to an integer mask with rint() of each element, rint of
1122 * scale and rint of offset. Each output pixel is then calculated as
1123 *
1124 * |[
1125 * sigma[i]{pixel[i] * mask[i]} / scale + offset
1126 * ]|
1127 *
1128 * The output image always has the same #VipsBandFormat as the input image.
1129 *
1130 * For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on
1131 * half-float arithmetic. This can produce slightly different results.
1132 * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
1133 * vips_vector_set_enabled().
1134 *
1135 * See also: vips_conv().
1136 *
1137 * Returns: 0 on success, -1 on error
1138 */
1139 int
vips_convi(VipsImage * in,VipsImage ** out,VipsImage * mask,...)1140 vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
1141 {
1142 va_list ap;
1143 int result;
1144
1145 va_start( ap, mask );
1146 result = vips_call_split( "convi", ap, in, out, mask );
1147 va_end( ap );
1148
1149 return( result );
1150 }
1151
1152