1 /* convi
2  *
3  * Copyright: 1990, N. Dessipris.
4  *
5  * Author: Nicos Dessipris & Kirk Martinez
6  * Written on: 29/04/1991
7  * Modified on: 19/05/1991
8  * 8/7/93 JC
9  *      - adapted for partial v2
10  *      - memory leaks fixed
11  *      - ANSIfied
12  * 23/7/93 JC
13  *	- inner loop unrolled with a switch - 25% speed-up!
14  * 13/12/93 JC
15  *	- tiny rounding error removed
16  * 7/10/94 JC
17  *	- new IM_ARRAY() macro
18  *	- various simplifications
19  *	- evalend callback added
20  * 1/2/95 JC
21  *	- use of IM_REGION_ADDR() updated
22  *	- output size was incorrect! see comment below
23  *	- bug with large non-square matricies fixed too
24  *	- uses new im_embed() function
25  * 13/7/98 JC
26  *	- wierd bug ... im_free_imask is no longer directly called for close
27  *	  callback, caused SIGKILL on solaris 2.6 ... linker bug?
28  * 9/3/01 JC
29  *	- reworked and simplified, about 10% faster
30  *	- slightly better range clipping
31  * 27/7/01 JC
32  *	- reject masks with scale == 0
33  * 7/4/04
34  *	- im_conv() now uses im_embed() with edge stretching on the input, not
35  *	  the output
36  *	- sets Xoffset / Yoffset
37  * 11/11/05
38  * 	- simpler inner loop avoids gcc4 bug
39  * 7/11/07
40  * 	- new evalstart/end callbacks
41  * 12/5/08
42  * 	- int rounding was +1 too much, argh
43  * 	- only rebuild the buffer offsets if bpl changes
44  * 5/4/09
45  * 	- tiny speedups and cleanups
46  * 	- add restrict, though it doesn't seem to help gcc
47  * 12/11/09
48  * 	- only check for non-zero elements once
49  * 	- add mask-all-zero check
50  * 	- cleanups
51  * 3/2/10
52  * 	- gtkdoc
53  * 	- more cleanups
54  * 23/08/10
55  * 	- add a special case for 3x3 masks, about 20% faster
56  * 1/10/10
57  * 	- support complex (just double the bands)
58  * 18/10/10
59  * 	- add experimental Orc path
60  * 29/10/10
61  * 	- use VipsVector
62  * 	- get rid of im_convsep(), just call this twice, no longer worth
63  * 	  keeping two versions
64  * 8/11/10
65  * 	- add array tiling
66  * 9/5/11
67  * 	- argh typo in overflow estimation could cause errors
68  * 15/10/11 Nicolas
69  * 	- handle offset correctly in seperable convolutions
70  * 26/1/16 Lovell Fuller
71  * 	- remove Duff for a 25% speedup
72  * 23/6/16
73  * 	- rewritten as a class
74  * 	- new fixed-point vector path, up to 2x faster
75  * 2/7/17
76  * 	- remove pts for a small speedup
77  * 12/10/17
78  * 	- fix leak of vectors, thanks MHeimbuc
79  * 14/10/17
80  * 	- switch to half-float for vector path
81  */
82 
83 /*
84 
85     This file is part of VIPS.
86 
87     VIPS is free software; you can redistribute it and/or modify
88     it under the terms of the GNU Lesser General Public License as published by
89     the Free Software Foundation; either version 2 of the License, or
90     (at your option) any later version.
91 
92     This program is distributed in the hope that it will be useful,
93     but WITHOUT ANY WARRANTY; without even the implied warranty of
94     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
95     GNU Lesser General Public License for more details.
96 
97     You should have received a copy of the GNU Lesser General Public License
98     along with this program; if not, write to the Free Software
99     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
100     02110-1301  USA
101 
102  */
103 
104 /*
105 
106     These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
107 
108  */
109 
110 /*
111 #define DEBUG
112 #define DEBUG_PIXELS
113 #define DEBUG_COMPILE
114  */
115 
116 #ifdef HAVE_CONFIG_H
117 #include <config.h>
118 #endif /*HAVE_CONFIG_H*/
119 #include <vips/intl.h>
120 
121 #include <stdio.h>
122 #include <stdlib.h>
123 #include <limits.h>
124 
125 #include <vips/vips.h>
126 #include <vips/internal.h>
127 
128 #include "pconvolution.h"
129 
130 /* Larger than this and we fall back to C.
131  */
132 #define MAX_PASS (20)
133 
134 /* A pass with a vector.
135  */
136 typedef struct {
137 	int first;		/* The index of the first mask coff we use */
138 	int last;		/* The index of the last mask coff we use */
139 
140 	int r;			/* Set previous result in this var */
141 
142         /* The code we generate for this section of the mask.
143 	 */
144         VipsVector *vector;
145 } Pass;
146 
147 typedef struct {
148 	VipsConvolution parent_instance;
149 
150 	int n_point;		/* w * h for our matrix */
151 
152 	/* We make a smaller version of the mask with the zeros squeezed out.
153 	 */
154 	int nnz;		/* Number of non-zero mask elements */
155 	int *coeff;		/* Array of non-zero mask coefficients */
156 	int *coeff_pos;		/* Index of each nnz element in mask->coeff */
157 
158 	/* And a half float version for the vector path. mant has the signed
159 	 * 8-bit mantissas in [-1, +1), sexp has the exponent shift after the
160 	 * mul and before the add, and exp has the final exponent shift before
161 	 * write-back.
162 	 */
163 	int *mant;
164 	int sexp;
165 	int exp;
166 
167 	/* The set of passes we need for this mask.
168 	 */
169 	int n_pass;
170 	Pass pass[MAX_PASS];
171 
172 	/* Code for the final clip back to 8 bits.
173 	 */
174 	int r;
175         VipsVector *vector;
176 } VipsConvi;
177 
178 typedef VipsConvolutionClass VipsConviClass;
179 
180 G_DEFINE_TYPE( VipsConvi, vips_convi, VIPS_TYPE_CONVOLUTION );
181 
182 /* Our sequence value.
183  */
184 typedef struct {
185 	VipsConvi *convi;
186 	VipsRegion *ir;		/* Input region */
187 
188 	int *offsets;		/* Offsets for each non-zero matrix element */
189 
190 	int last_bpl;		/* Avoid recalcing offsets, if we can */
191 
192 	/* We need a pair of intermediate buffers to keep the results of each
193 	 * vector conv pass.
194 	 */
195 	short *t1;
196 	short *t2;
197 } VipsConviSequence;
198 
199 static void
vips_convi_compile_free(VipsConvi * convi)200 vips_convi_compile_free( VipsConvi *convi )
201 {
202 	int i;
203 
204 	for( i = 0; i < convi->n_pass; i++ )
205 		VIPS_FREEF( vips_vector_free, convi->pass[i].vector );
206 	convi->n_pass = 0;
207 	VIPS_FREEF( vips_vector_free, convi->vector );
208 }
209 
210 static void
vips_convi_dispose(GObject * gobject)211 vips_convi_dispose( GObject *gobject )
212 {
213 	VipsConvi *convi = (VipsConvi *) gobject;
214 
215 #ifdef DEBUG
216 	printf( "vips_convi_dispose: " );
217 	vips_object_print_name( VIPS_OBJECT( gobject ) );
218 	printf( "\n" );
219 #endif /*DEBUG*/
220 
221 	vips_convi_compile_free( convi );
222 
223 	G_OBJECT_CLASS( vips_convi_parent_class )->dispose( gobject );
224 }
225 
226 /* Free a sequence value.
227  */
228 static int
vips_convi_stop(void * vseq,void * a,void * b)229 vips_convi_stop( void *vseq, void *a, void *b )
230 {
231 	VipsConviSequence *seq = (VipsConviSequence *) vseq;
232 
233 	VIPS_UNREF( seq->ir );
234 	VIPS_FREE( seq->offsets );
235 	VIPS_FREE( seq->t1 );
236 	VIPS_FREE( seq->t2 );
237 
238 	return( 0 );
239 }
240 
241 /* Convolution start function.
242  */
243 static void *
vips_convi_start(VipsImage * out,void * a,void * b)244 vips_convi_start( VipsImage *out, void *a, void *b )
245 {
246 	VipsImage *in = (VipsImage *) a;
247 	VipsConvi *convi = (VipsConvi *) b;
248 	VipsConviSequence *seq;
249 
250 	if( !(seq = VIPS_NEW( out, VipsConviSequence )) )
251 		return( NULL );
252 
253 	seq->convi = convi;
254 	seq->ir = NULL;
255 	seq->offsets = NULL;
256 	seq->last_bpl = -1;
257 	seq->t1 = NULL;
258 	seq->t2 = NULL;
259 
260 	seq->ir = vips_region_new( in );
261 
262 	/* C mode.
263 	 */
264 	if( convi->nnz ) {
265 		if( !(seq->offsets = VIPS_ARRAY( NULL, convi->nnz, int )) ) {
266 			vips_convi_stop( seq, in, convi );
267 			return( NULL );
268 		}
269 	}
270 
271 	/* Vector mode.
272 	 */
273 	if( convi->n_pass ) {
274 		seq->t1 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
275 		seq->t2 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
276 
277 		if( !seq->t1 ||
278 			!seq->t2 ) {
279 			vips_convi_stop( seq, in, convi );
280 			return( NULL );
281 		}
282 	}
283 
284 	return( (void *) seq );
285 }
286 
287 #define TEMP( N, S ) vips_vector_temporary( v, (char *) N, S )
288 #define PARAM( N, S ) vips_vector_parameter( v, (char *) N, S )
289 #define SCANLINE( N, P, S ) vips_vector_source_scanline( v, (char *) N, P, S )
290 #define CONST( N, V, S ) vips_vector_constant( v, (char *) N, V, S )
291 #define ASM2( OP, A, B ) vips_vector_asm2( v, (char *) OP, A, B )
292 #define ASM3( OP, A, B, C ) vips_vector_asm3( v, (char *) OP, A, B, C )
293 
294 /* Generate code for a section of the mask. first is the index we start
295  * at, we set last to the index of the last one we use before we run
296  * out of intermediates / constants / parameters / sources or mask
297  * coefficients.
298  *
299  * 0 for success, -1 on error.
300  */
301 static int
vips_convi_compile_section(VipsConvi * convi,VipsImage * in,Pass * pass)302 vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
303 {
304 	VipsConvolution *convolution = (VipsConvolution *) convi;
305 	VipsImage *M = convolution->M;
306 
307 	VipsVector *v;
308 	int i;
309 
310 #ifdef DEBUG_COMPILE
311 	printf( "starting pass %d\n", pass->first );
312 #endif /*DEBUG_COMPILE*/
313 
314 	pass->vector = v = vips_vector_new( "convi", 2 );
315 
316 	/* "r" is the array of sums from the previous pass (if any).
317 	 */
318 	pass->r = vips_vector_source_name( v, "r", 2 );
319 
320 	/* The value we fetch from the image, the accumulated sum.
321 	 */
322 	TEMP( "value", 2 );
323 	TEMP( "valueb", 1 );
324 	TEMP( "sum", 2 );
325 
326 	/* Init the sum. If this is the first pass, it's a constant. If this
327 	 * is a later pass, we have to init the sum from the result
328 	 * of the previous pass.
329 	 */
330 	if( pass->first == 0 ) {
331 		char c0[256];
332 
333 		CONST( c0, 0, 2 );
334 		ASM2( "loadpw", "sum", c0 );
335 	}
336 	else
337 		ASM2( "loadw", "sum", "r" );
338 
339 	for( i = pass->first; i < convi->n_point; i++ ) {
340 		int x = i % M->Xsize;
341 		int y = i / M->Xsize;
342 
343 		char source[256];
344 		char off[256];
345 		char rnd[256];
346 		char sexp[256];
347 		char coeff[256];
348 
349 		/* Exclude zero elements.
350 		 */
351 		if( !convi->mant[i] )
352 			continue;
353 
354 		/* The source. sl0 is the first scanline in the mask.
355 		 */
356 		SCANLINE( source, y, 1 );
357 
358 		/* Load with an offset. Only for non-first-columns though.
359 		 */
360 		if( x == 0 )
361 			ASM2( "convubw", "value", source );
362 		else {
363 			CONST( off, in->Bands * x, 1 );
364 			ASM3( "loadoffb", "valueb", source, off );
365 			ASM2( "convubw", "value", "valueb" );
366 		}
367 
368 		/* We need a signed multiply, so the image pixel needs to
369 		 * become a signed 16-bit value. We know only the bottom 8 bits
370 		 * of the image and coefficient are interesting, so we can take
371 		 * the bottom half of a 16x16->32 multiply.
372 		 */
373 		CONST( coeff, convi->mant[i], 2 );
374 		ASM3( "mullw", "value", "value", coeff );
375 
376 		/* Shift right before add to prevent overflow on large masks.
377 		 */
378 		CONST( sexp, convi->sexp, 2 );
379 		CONST( rnd, 1 << (convi->sexp - 1), 2 );
380 		ASM3( "addw", "value", "value", rnd );
381 		ASM3( "shrsw", "value", "value", sexp );
382 
383 		/* We accumulate the signed 16-bit result in sum. Saturated
384 		 * add.
385 		 */
386 		ASM3( "addssw", "sum", "sum", "value" );
387 
388 		if( vips_vector_full( v ) )
389 			break;
390 	}
391 
392 	pass->last = i;
393 
394 	/* And write to our intermediate buffer.
395 	 */
396 	ASM2( "copyw", "d1", "sum" );
397 
398 	if( !vips_vector_compile( v ) )
399 		return( -1 );
400 
401 #ifdef DEBUG_COMPILE
402 	printf( "done coeffs %d to %d\n", pass->first, pass->last );
403 	vips_vector_print( v );
404 #endif /*DEBUG_COMPILE*/
405 
406 	return( 0 );
407 }
408 
409 /* Generate code for the final 16->8 conversion.
410  *
411  * 0 for success, -1 on error.
412  */
413 static int
vips_convi_compile_clip(VipsConvi * convi)414 vips_convi_compile_clip( VipsConvi *convi )
415 {
416 	VipsConvolution *convolution = (VipsConvolution *) convi;
417 	VipsImage *M = convolution->M;
418 	int offset = VIPS_RINT( vips_image_get_offset( M ) );
419 
420 	VipsVector *v;
421 	char rnd[256];
422 	char exp[256];
423 	char c0[256];
424 	char c255[256];
425 	char off[256];
426 
427 	convi->vector = v = vips_vector_new( "convi", 1 );
428 
429 	/* "r" is the array of sums we clip down.
430 	 */
431 	convi->r = vips_vector_source_name( v, "r", 2 );
432 
433 	/* The value we fetch from the image.
434 	 */
435 	TEMP( "value", 2 );
436 
437 	CONST( rnd, 1 << (convi->exp - 1), 2 );
438 	ASM3( "addw", "value", "r", rnd );
439 	CONST( exp, convi->exp, 2 );
440 	ASM3( "shrsw", "value", "value", exp );
441 
442 	CONST( off, offset, 2 );
443 	ASM3( "addw", "value", "value", off );
444 
445 	/* You'd think "convsuswb" (convert signed 16-bit to unsigned
446 	 * 8-bit with saturation) would be quicker, but it's a lot
447 	 * slower.
448 	 */
449 	CONST( c0, 0, 2 );
450 	ASM3( "maxsw", "value", c0, "value" );
451 	CONST( c255, 255, 2 );
452 	ASM3( "minsw", "value", c255, "value" );
453 
454 	ASM2( "convwb", "d1", "value" );
455 
456 	if( !vips_vector_compile( v ) )
457 		return( -1 );
458 
459 	return( 0 );
460 }
461 
462 static int
vips_convi_compile(VipsConvi * convi,VipsImage * in)463 vips_convi_compile( VipsConvi *convi, VipsImage *in )
464 {
465 	int i;
466 	Pass *pass;
467 
468 	/* Generate passes until we've used up the whole mask.
469 	 */
470 	for( i = 0;; ) {
471 		/* Allocate space for another pass.
472 		 */
473 		if( convi->n_pass == MAX_PASS )
474 			return( -1 );
475 		pass = &convi->pass[convi->n_pass];
476 		convi->n_pass += 1;
477 
478 		pass->first = i;
479 		pass->r = -1;
480 
481 		if( vips_convi_compile_section( convi, in, pass ) )
482 			return( -1 );
483 		i = pass->last + 1;
484 
485 		if( i >= convi->n_point )
486 			break;
487 	}
488 
489 	if( vips_convi_compile_clip( convi ) )
490 		return( -1 );
491 
492 	return( 0 );
493 }
494 
495 static int
vips_convi_gen_vector(VipsRegion * or,void * vseq,void * a,void * b,gboolean * stop)496 vips_convi_gen_vector( VipsRegion *or,
497 	void *vseq, void *a, void *b, gboolean *stop )
498 {
499 	VipsConviSequence *seq = (VipsConviSequence *) vseq;
500 	VipsConvi *convi = (VipsConvi *) b;
501 	VipsConvolution *convolution = (VipsConvolution *) convi;
502 	VipsImage *M = convolution->M;
503 	VipsImage *in = (VipsImage *) a;
504 	VipsRegion *ir = seq->ir;
505 	VipsRect *r = &or->valid;
506 	int ne = r->width * in->Bands;
507 
508 	VipsRect s;
509 	int i, y;
510 	VipsExecutor executor[MAX_PASS];
511 	VipsExecutor clip;
512 
513 #ifdef DEBUG_PIXELS
514 	printf( "vips_convi_gen_vector: generating %d x %d at %d x %d\n",
515 		r->width, r->height, r->left, r->top );
516 #endif /*DEBUG_PIXELS*/
517 
518 	/* Prepare the section of the input image we need. A little larger
519 	 * than the section of the output image we are producing.
520 	 */
521 	s = *r;
522 	s.width += M->Xsize - 1;
523 	s.height += M->Ysize - 1;
524 	if( vips_region_prepare( ir, &s ) )
525 		return( -1 );
526 
527 	for( i = 0; i < convi->n_pass; i++ )
528 		vips_executor_set_program( &executor[i],
529 			convi->pass[i].vector, ne );
530 	vips_executor_set_program( &clip, convi->vector, ne );
531 
532 	VIPS_GATE_START( "vips_convi_gen_vector: work" );
533 
534 	for( y = 0; y < r->height; y ++ ) {
535 		VipsPel *q = VIPS_REGION_ADDR( or, r->left, r->top + y );
536 
537 #ifdef DEBUG_PIXELS
538 {
539 		int h, v;
540 
541 		printf( "before convolve: x = %d, y = %d\n",
542 			r->left, r->top + y );
543 		for( v = 0; v < M->Ysize; v++ ) {
544 			for( h = 0; h < M->Xsize; h++ )
545 				printf( "%3d ", *VIPS_REGION_ADDR( ir,
546 					r->left + h, r->top + y + v ) );
547 			printf( "\n" );
548 		}
549 }
550 #endif /*DEBUG_PIXELS*/
551 
552 		/* We run our n passes to generate this scanline.
553 		 */
554 		for( i = 0; i < convi->n_pass; i++ ) {
555 			Pass *pass = &convi->pass[i];
556 
557 			vips_executor_set_scanline( &executor[i],
558 				ir, r->left, r->top + y );
559 			vips_executor_set_array( &executor[i],
560 				pass->r, seq->t1 );
561 			vips_executor_set_destination( &executor[i], seq->t2 );
562 			vips_executor_run( &executor[i] );
563 
564 			VIPS_SWAP( signed short *, seq->t1, seq->t2 );
565 		}
566 
567 #ifdef DEBUG_PIXELS
568 		printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
569 #endif /*DEBUG_PIXELS*/
570 
571 		vips_executor_set_array( &clip, convi->r, seq->t1 );
572 		vips_executor_set_destination( &clip, q );
573 		vips_executor_run( &clip );
574 
575 #ifdef DEBUG_PIXELS
576 		printf( "after clip: %d\n",
577 			*VIPS_REGION_ADDR( or, r->left, r->top + y ) );
578 #endif /*DEBUG_PIXELS*/
579 	}
580 
581 	VIPS_GATE_STOP( "vips_convi_gen_vector: work" );
582 
583 	VIPS_COUNT_PIXELS( or, "vips_convi_gen_vector" );
584 
585 	return( 0 );
586 }
587 
588 /* INT inner loops.
589  */
590 #define CONV_INT( TYPE, CLIP ) { \
591 	TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
592 	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
593 	int * restrict offsets = seq->offsets; \
594 	\
595 	for( x = 0; x < sz; x++ ) {  \
596 		int sum; \
597 		int i; \
598 		\
599 		sum = 0; \
600 		for ( i = 0; i < nnz; i++ ) \
601 			sum += t[i] * p[offsets[i]]; \
602 		\
603 		sum = ((sum + rounding) / scale) + offset; \
604 		\
605 		CLIP; \
606 		\
607 		q[x] = sum;  \
608 		p += 1; \
609 	}  \
610 }
611 
612 /* FLOAT inner loops.
613  */
614 #define CONV_FLOAT( TYPE ) { \
615 	TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
616 	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
617 	int * restrict offsets = seq->offsets; \
618 	\
619 	for( x = 0; x < sz; x++ ) {  \
620 		double sum; \
621 		int i; \
622 		\
623 		sum = 0; \
624 		for ( i = 0; i < nnz; i++ ) \
625 			sum += t[i] * p[offsets[i]]; \
626  		\
627 		sum = (sum / scale) + offset; \
628 		\
629 		q[x] = sum;  \
630 		p += 1; \
631 	}  \
632 }
633 
634 /* Various integer range clips. Record over/under flows.
635  */
636 #define CLIP_UCHAR( V ) \
637 G_STMT_START { \
638 	if( (V) < 0 ) \
639 		(V) = 0; \
640 	else if( (V) > UCHAR_MAX ) \
641 		(V) = UCHAR_MAX; \
642 } G_STMT_END
643 
644 #define CLIP_CHAR( V ) \
645 G_STMT_START { \
646 	if( (V) < SCHAR_MIN ) \
647 		(V) = SCHAR_MIN; \
648 	else if( (V) > SCHAR_MAX ) \
649 		(V) = SCHAR_MAX; \
650 } G_STMT_END
651 
652 #define CLIP_USHORT( V ) \
653 G_STMT_START { \
654 	if( (V) < 0 ) \
655 		(V) = 0; \
656 	else if( (V) > USHRT_MAX ) \
657 		(V) = USHRT_MAX; \
658 } G_STMT_END
659 
660 #define CLIP_SHORT( V ) \
661 G_STMT_START { \
662 	if( (V) < SHRT_MIN ) \
663 		(V) = SHRT_MIN; \
664 	else if( (V) > SHRT_MAX ) \
665 		(V) = SHRT_MAX; \
666 } G_STMT_END
667 
668 #define CLIP_NONE( V ) {}
669 
670 /* Convolve!
671  */
672 static int
vips_convi_gen(VipsRegion * or,void * vseq,void * a,void * b,gboolean * stop)673 vips_convi_gen( VipsRegion *or,
674 	void *vseq, void *a, void *b, gboolean *stop )
675 {
676 	VipsConviSequence *seq = (VipsConviSequence *) vseq;
677 	VipsConvi *convi = (VipsConvi *) b;
678 	VipsConvolution *convolution = (VipsConvolution *) convi;
679 	VipsImage *M = convolution->M;
680 	int scale = VIPS_RINT( vips_image_get_scale( M ) );
681 	int rounding = scale / 2;
682 	int offset = VIPS_RINT( vips_image_get_offset( M ) );
683 	VipsImage *in = (VipsImage *) a;
684 	VipsRegion *ir = seq->ir;
685 	int * restrict t = convi->coeff;
686 	const int nnz = convi->nnz;
687 	VipsRect *r = &or->valid;
688 	int le = r->left;
689 	int to = r->top;
690 	int bo = VIPS_RECT_BOTTOM( r );
691 	int sz = VIPS_REGION_N_ELEMENTS( or ) *
692 		(vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);
693 
694 	VipsRect s;
695 	int x, y, z, i;
696 
697 	/* Prepare the section of the input image we need. A little larger
698 	 * than the section of the output image we are producing.
699 	 */
700 	s = *r;
701 	s.width += M->Xsize - 1;
702 	s.height += M->Ysize - 1;
703 	if( vips_region_prepare( ir, &s ) )
704 		return( -1 );
705 
706         /* Fill offset array. Only do this if the bpl has changed since the
707 	 * previous vips_region_prepare().
708 	 */
709 	if( seq->last_bpl != VIPS_REGION_LSKIP( ir ) ) {
710 		seq->last_bpl = VIPS_REGION_LSKIP( ir );
711 
712 		for( i = 0; i < nnz; i++ ) {
713 			z = convi->coeff_pos[i];
714 			x = z % M->Xsize;
715 			y = z / M->Xsize;
716 
717 			seq->offsets[i] =
718 				(VIPS_REGION_ADDR( ir, x + le, y + to ) -
719 				 VIPS_REGION_ADDR( ir, le, to )) /
720 					VIPS_IMAGE_SIZEOF_ELEMENT( ir->im );
721 		}
722 	}
723 
724 	VIPS_GATE_START( "vips_convi_gen: work" );
725 
726 	for( y = to; y < bo; y++ ) {
727 		switch( in->BandFmt ) {
728 		case VIPS_FORMAT_UCHAR:
729 			CONV_INT( unsigned char, CLIP_UCHAR( sum ) );
730 			break;
731 
732 		case VIPS_FORMAT_CHAR:
733 			CONV_INT( signed char, CLIP_CHAR( sum ) );
734 			break;
735 
736 		case VIPS_FORMAT_USHORT:
737 			CONV_INT( unsigned short, CLIP_USHORT( sum ) );
738 			break;
739 
740 		case VIPS_FORMAT_SHORT:
741 			CONV_INT( signed short, CLIP_SHORT( sum ) );
742 			break;
743 
744 		case VIPS_FORMAT_UINT:
745 			CONV_INT( unsigned int, CLIP_NONE( sum ) );
746 			break;
747 
748 		case VIPS_FORMAT_INT:
749 			CONV_INT( signed int, CLIP_NONE( sum ) );
750 			break;
751 
752 		case VIPS_FORMAT_FLOAT:
753 		case VIPS_FORMAT_COMPLEX:
754 			CONV_FLOAT( float );
755 			break;
756 
757 		case VIPS_FORMAT_DOUBLE:
758 		case VIPS_FORMAT_DPCOMPLEX:
759 			CONV_FLOAT( double );
760 			break;
761 
762 		default:
763 			g_assert_not_reached();
764 		}
765 	}
766 
767 	VIPS_GATE_STOP( "vips_convi_gen: work" );
768 
769 	VIPS_COUNT_PIXELS( or, "vips_convi_gen" );
770 
771 	return( 0 );
772 }
773 
774 /* Make an int version of a mask.
775  *
776  * We rint() everything, then adjust the scale try to match the overall
777  * effect.
778  */
779 int
vips__image_intize(VipsImage * in,VipsImage ** out)780 vips__image_intize( VipsImage *in, VipsImage **out )
781 {
782 	VipsImage *t;
783 	int x, y;
784 	double double_result;
785 	double out_scale;
786 	double out_offset;
787 	int int_result;
788 
789 	if( vips_check_matrix( "vips2imask", in, &t ) )
790 		return( -1 );
791 	if( !(*out = vips_image_new_matrix( t->Xsize, t->Ysize )) ) {
792 		g_object_unref( t );
793 		return( -1 );
794 	}
795 
796 	/* We want to make an intmask which has the same input to output ratio
797 	 * as the double image.
798 	 *
799 	 * Imagine convolving with the double image, what's the ratio of
800 	 * brightness between input and output? We want the same ratio for the
801 	 * int version, if we can.
802 	 *
803 	 * Imagine an input image where every pixel is 1, what will the output
804 	 * be?
805 	 */
806 	double_result = 0;
807 	for( y = 0; y < t->Ysize; y++ )
808 		for( x = 0; x < t->Xsize; x++ )
809 			double_result += *VIPS_MATRIX( t, x, y );
810 	double_result /= vips_image_get_scale( t );
811 
812 	for( y = 0; y < t->Ysize; y++ )
813 		for( x = 0; x < t->Xsize; x++ )
814 			*VIPS_MATRIX( *out, x, y ) =
815 				VIPS_RINT( *VIPS_MATRIX( t, x, y ) );
816 
817 	out_scale = VIPS_RINT( vips_image_get_scale( t ) );
818 	if( out_scale == 0 )
819 		out_scale = 1;
820 	out_offset = VIPS_RINT( vips_image_get_offset( t ) );
821 
822 	/* Now convolve a 1 everywhere image with the int version we've made,
823 	 * what do we get?
824 	 */
825 	int_result = 0;
826 	for( y = 0; y < t->Ysize; y++ )
827 		for( x = 0; x < t->Xsize; x++ )
828 			int_result += *VIPS_MATRIX( *out, x, y );
829 	int_result /= out_scale;
830 
831 	/* And adjust the scale to get as close to a match as we can.
832 	 */
833 	out_scale = VIPS_RINT( out_scale + (int_result - double_result) );
834 	if( out_scale == 0 )
835 		out_scale = 1;
836 
837 	vips_image_set_double( *out, "scale", out_scale );
838 	vips_image_set_double( *out, "offset", out_offset );
839 
840 	g_object_unref( t );
841 
842 	return( 0 );
843 }
844 
845 /* Make an int version of a mask. Each element is 8.8 float, with the same
846  * exponent for each element (so just 8 bits in @out).
847  *
848  * @out is a w x h int array.
849  */
850 static int
vips_convi_intize(VipsConvi * convi,VipsImage * M)851 vips_convi_intize( VipsConvi *convi, VipsImage *M )
852 {
853 	VipsImage *t;
854 	double scale;
855 	double *scaled;
856 	double mx;
857 	double mn;
858 	int shift;
859 	int i;
860 
861 	if( vips_check_matrix( "vips2imask", M, &t ) )
862 		return( -1 );
863 
864 	/* Bake the scale into the mask to make a double version.
865 	 */
866 	scale = vips_image_get_scale( t );
867         if( !(scaled = VIPS_ARRAY( convi, convi->n_point, double )) ) {
868 		g_object_unref( t );
869 		return( -1 );
870 	}
871 	for( i = 0; i < convi->n_point; i++ )
872 		scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale;
873 	g_object_unref( t );
874 
875 #ifdef DEBUG_COMPILE
876 {
877 	int x, y;
878 
879 	printf( "vips_convi_intize: double version\n" );
880 	for( y = 0; y < t->Ysize; y++ ) {
881 		printf( "\t" );
882 		for( x = 0; x < t->Xsize; x++ )
883 			printf( "%g ", scaled[y * t->Xsize + x] );
884 		printf( "\n" );
885 	}
886 }
887 #endif /*DEBUG_COMPILE*/
888 
889 	mx = scaled[0];
890 	mn = scaled[0];
891 	for( i = 1; i < convi->n_point; i++ ) {
892 		if( scaled[i] > mx )
893 			mx = scaled[i];
894 		if( scaled[i] < mn )
895 			mn = scaled[i];
896 	}
897 
898 	/* The mask max rounded up to the next power of two gives the exponent
899 	 * all elements share. Values are eg. -3 for 1/8, 3 for 8.
900 	 *
901 	 * Add one so we round up stuff exactly on x.0. We multiply by 128
902 	 * later, so 1.0 (for example) would become 128, which is outside
903 	 * signed 8 bit.
904 	 */
905 	shift = ceil( log2( mx ) + 1 );
906 
907 	/* We need to sum n_points, so we have to shift right before adding a
908 	 * new value to make sure we have enough range.
909 	 */
910 	convi->sexp = ceil( log2( convi->n_point ) );
911 	if( convi->sexp > 10 ) {
912 		g_info( "vips_convi_intize: mask too large" );
913 		return( -1 );
914 	}
915 
916 	/* With that already done, the final shift must be ...
917 	 */
918 	convi->exp = 7 - shift - convi->sexp;
919 
920 	if( !(convi->mant = VIPS_ARRAY( convi, convi->n_point, int )) )
921 		return( -1 );
922 	for( i = 0; i < convi->n_point; i++ ) {
923 		/* 128 since this is signed.
924 		 */
925 		convi->mant[i] = VIPS_RINT( 128 * scaled[i] * pow(2, -shift) );
926 
927 		if( convi->mant[i] < -128 ||
928 			convi->mant[i] > 127 ) {
929 			g_info( "vips_convi_intize: mask range too large" );
930 			return( -1 );
931 		}
932 	}
933 
934 #ifdef DEBUG_COMPILE
935 {
936 	int x, y;
937 
938 	printf( "vips_convi_intize:\n" );
939 	printf( "sexp = %d\n", convi->sexp );
940 	printf( "exp = %d\n", convi->exp );
941 	for( y = 0; y < t->Ysize; y++ ) {
942 		printf( "\t" );
943 		for( x = 0; x < t->Xsize; x++ )
944 			printf( "%4d ", convi->mant[y * t->Xsize + x] );
945 		printf( "\n" );
946 	}
947 }
948 #endif /*DEBUG_COMPILE*/
949 
950 	/* Verify accuracy.
951 	 */
952 {
953 	double true_sum;
954 	int int_sum;
955 	int true_value;
956 	int int_value;
957 
958 	true_sum = 0.0;
959 	int_sum = 0;
960 	for( i = 0; i < convi->n_point; i++ ) {
961 		int value;
962 
963 		true_sum += 128 * scaled[i];
964 		value = 128 * convi->mant[i];
965 		value = (value + (1 << (convi->sexp - 1))) >> convi->sexp;
966 		int_sum += value;
967 		int_sum = VIPS_CLIP( SHRT_MIN, int_sum, SHRT_MAX );
968 	}
969 
970 	true_value = VIPS_CLIP( 0, true_sum, 255 );
971 
972 	if( convi->exp > 0 )
973 		int_value = (int_sum + (1 << (convi->exp - 1))) >> convi->exp;
974 	else
975 		int_value = VIPS_LSHIFT_INT( int_sum, convi->exp );
976 	int_value = VIPS_CLIP( 0, int_value, 255 );
977 
978 	if( VIPS_ABS( true_value - int_value ) > 2 ) {
979 		g_info( "vips_convi_intize: too inaccurate" );
980 		return( -1 );
981 	}
982 }
983 
984 	return( 0 );
985 }
986 
987 static int
vips_convi_build(VipsObject * object)988 vips_convi_build( VipsObject *object )
989 {
990 	VipsConvolution *convolution = (VipsConvolution *) object;
991 	VipsConvi *convi = (VipsConvi *) object;
992 	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
993 
994 	VipsImage *in;
995 	VipsImage *M;
996 	VipsGenerateFn generate;
997 	double *coeff;
998         int i;
999 
1000 	if( VIPS_OBJECT_CLASS( vips_convi_parent_class )->build( object ) )
1001 		return( -1 );
1002 
1003 	in = convolution->in;
1004 	M = convolution->M;
1005 	convi->n_point = M->Xsize * M->Ysize;
1006 
1007 	if( vips_embed( in, &t[0],
1008 		M->Xsize / 2, M->Ysize / 2,
1009 		in->Xsize + M->Xsize - 1, in->Ysize + M->Ysize - 1,
1010 		"extend", VIPS_EXTEND_COPY,
1011 		NULL ) )
1012 		return( -1 );
1013 	in = t[0];
1014 
1015 	/* Default to the C path.
1016 	 */
1017 	generate = vips_convi_gen;
1018 
1019 	/* For uchar input, try to make a vector path.
1020 	 */
1021 	if( vips_vector_isenabled() &&
1022 		in->BandFmt == VIPS_FORMAT_UCHAR ) {
1023 		if( !vips_convi_intize( convi, M ) &&
1024 			!vips_convi_compile( convi, in ) ) {
1025 			generate = vips_convi_gen_vector;
1026 			g_info( "convi: using vector path" );
1027 		}
1028 		else
1029 			vips_convi_compile_free( convi );
1030 	}
1031 
1032 	/* Make the data for the C path.
1033 	 */
1034 	if( generate == vips_convi_gen ) {
1035 		g_info( "convi: using C path" );
1036 
1037 		/* Make an int version of our mask.
1038 		 */
1039 		if( vips__image_intize( M, &t[1] ) )
1040 			return( -1 );
1041 		M = t[1];
1042 
1043 		coeff = VIPS_MATRIX( M, 0, 0 );
1044 		if( !(convi->coeff = VIPS_ARRAY( object, convi->n_point, int )) ||
1045 			!(convi->coeff_pos =
1046 				VIPS_ARRAY( object, convi->n_point, int )) )
1047 			return( -1 );
1048 
1049 		/* Squeeze out zero mask elements.
1050 		 */
1051 		convi->nnz = 0;
1052 		for( i = 0; i < convi->n_point; i++ )
1053 			if( coeff[i] ) {
1054 				convi->coeff[convi->nnz] = coeff[i];
1055 				convi->coeff_pos[convi->nnz] = i;
1056 				convi->nnz += 1;
1057 			}
1058 
1059 		/* Was the whole mask zero? We must have at least 1 element
1060 		 * in there: set it to zero.
1061 		 */
1062 		if( convi->nnz == 0 ) {
1063 			convi->coeff[0] = 0;
1064 			convi->coeff_pos[0] = 0;
1065 			convi->nnz = 1;
1066 		}
1067 	}
1068 
1069 	g_object_set( convi, "out", vips_image_new(), NULL );
1070 	if( vips_image_pipelinev( convolution->out,
1071 		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
1072 		return( -1 );
1073 
1074 	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
1075 	 * would be 1x1.
1076 	 */
1077 	convolution->out->Xsize -= M->Xsize - 1;
1078 	convolution->out->Ysize -= M->Ysize - 1;
1079 
1080 	if( vips_image_generate( convolution->out,
1081 		vips_convi_start, generate, vips_convi_stop, in, convi ) )
1082 		return( -1 );
1083 
1084 	convolution->out->Xoffset = -M->Xsize / 2;
1085 	convolution->out->Yoffset = -M->Ysize / 2;
1086 
1087 	return( 0 );
1088 }
1089 
1090 static void
vips_convi_class_init(VipsConviClass * class)1091 vips_convi_class_init( VipsConviClass *class )
1092 {
1093 	GObjectClass *gobject_class = G_OBJECT_CLASS( class );
1094 	VipsObjectClass *object_class = (VipsObjectClass *) class;
1095 
1096 	gobject_class->dispose = vips_convi_dispose;
1097 
1098 	object_class->nickname = "convi";
1099 	object_class->description = _( "int convolution operation" );
1100 	object_class->build = vips_convi_build;
1101 }
1102 
1103 static void
vips_convi_init(VipsConvi * convi)1104 vips_convi_init( VipsConvi *convi )
1105 {
1106         convi->nnz = 0;
1107         convi->coeff = NULL;
1108         convi->coeff_pos = NULL;
1109 }
1110 
1111 /**
1112  * vips_convi: (method)
1113  * @in: input image
1114  * @out: (out): output image
1115  * @mask: convolve with this mask
1116  * @...: %NULL-terminated list of optional named arguments
1117  *
1118  * Integer convolution. This is a low-level operation, see vips_conv() for
1119  * something more convenient.
1120  *
1121  * @mask is converted to an integer mask with rint() of each element, rint of
1122  * scale and rint of offset. Each output pixel is then calculated as
1123  *
1124  * |[
1125  * sigma[i]{pixel[i] * mask[i]} / scale + offset
1126  * ]|
1127  *
1128  * The output image always has the same #VipsBandFormat as the input image.
1129  *
1130  * For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on
1131  * half-float arithmetic. This can produce slightly different results.
1132  * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
1133  * vips_vector_set_enabled().
1134  *
1135  * See also: vips_conv().
1136  *
1137  * Returns: 0 on success, -1 on error
1138  */
1139 int
vips_convi(VipsImage * in,VipsImage ** out,VipsImage * mask,...)1140 vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
1141 {
1142 	va_list ap;
1143 	int result;
1144 
1145 	va_start( ap, mask );
1146 	result = vips_call_split( "convi", ap, in, out, mask );
1147 	va_end( ap );
1148 
1149 	return( result );
1150 }
1151 
1152