1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27 /*
28 * FUNCTION
29 * Internal functions for mlib_ImageConv* on U8/S16/U16 types and
30 * MLIB_EDGE_DST_NO_WRITE mask
31 */
32
33 #include "mlib_image.h"
34 #include "mlib_ImageConv.h"
35 #include "mlib_c_ImageConv.h"
36
/*
 * IMG_TYPE selects the pixel data type this file is compiled for.
 * The #if chain below maps it to the per-type settings:
 *   1 -> mlib_u8, 2 -> mlib_s16, 3 -> mlib_u16
 */
#define IMG_TYPE 1

/***************************************************************/
/*
 * Per-type configuration macros:
 *   DTYPE           element type of the source/destination pixels
 *   CONV_FUNC(K)    name of the floating-point routine for kernel K
 *                   (e.g. CONV_FUNC(3x3) pastes "3x3" into the name)
 *   CONV_FUNC_I(K)  name of the integer-arithmetic routine for kernel K
 *   DSCALE          initial value of the kernel scale factor used by
 *                   LOAD_KERNEL3/LOAD_KERNEL before it is divided by
 *                   2^scalef_expon
 *   FROM_S32(x)     turns the clamped 32-bit result of D2I into the value
 *                   stored in a pixel (right shift, plus an XOR for the
 *                   unsigned types)
 *   S64TOS32(x)     applied to the low half of the 64-bit word built by
 *                   LOAD_BUFF; the s16 variant masks to 32 bits
 *   SAT_OFF         token sequence pasted after the operand inside D2I,
 *                   i.e. D2I(x) clamps "(x) SAT_OFF".  It must stay
 *                   unparenthesized because it supplies the binary minus.
 *                   NOTE(review): the XOR in FROM_S32 presumably undoes
 *                   this bias for the unsigned types - confirm.
 */
#if IMG_TYPE == 1

#define DTYPE mlib_u8
#define CONV_FUNC(KERN) mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE (1 << 24)
#define FROM_S32(x) (((x) >> 24) ^ 128)
#define S64TOS32(x) (x)
#define SAT_OFF -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE mlib_s16
#define CONV_FUNC(KERN) mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE 65536.0
#define FROM_S32(x) ((x) >> 16)
#define S64TOS32(x) ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE mlib_u16
#define CONV_FUNC(KERN) mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE 65536.0
#define FROM_S32(x) (((x) >> 16) ^ 0x8000)
#define S64TOS32(x) (x)
#define SAT_OFF -(1u << 31)

#endif /* IMG_TYPE == 1 */
74
/***************************************************************/
/* NOTE(review): BUFF_SIZE and CACHE_SIZE are not referenced in the part of
 * the file visible here - confirm they are used further down. */
#define BUFF_SIZE 1600

#define CACHE_SIZE (64*1024)

/***************************************************************/
/* Floating-point type used for the line buffers and the scaled kernel. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x): convert x to mlib_s32.
 * Unless MLIB_USE_FTOI_CLAMPING is defined, values at or beyond
 * MLIB_S32_MIN / MLIB_S32_MAX are saturated explicitly before the cast;
 * otherwise a plain cast is used.
 * Note: the explicit variant evaluates its argument more than once.
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x) \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * D2I(x): apply the per-type offset SAT_OFF to the floating-point sum x
 * and convert the result to a saturated mlib_s32.  SAT_OFF is pasted as
 * text, so for the unsigned types this expands to CLAMP_S32((x) -(1u << 31)).
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
96
/***************************************************************/
/*
 * STORE2(res0, res1): write two results to dp[0] and dp[chan1].
 * Expects 'dp' and 'chan1' in the enclosing scope.  With VM_LITTLE_ENDIAN
 * the two arguments are stored in swapped order.  In the visible code it is
 * only used on the __sparc path, where the arguments are the two halves of
 * a 64-bit word.
 * Expands to two statements - do not use as the body of an unbraced if/for.
 */
#ifdef VM_LITTLE_ENDIAN

#define STORE2(res0, res1) \
  dp[0 ] = res1; \
  dp[chan1] = res0

#else

#define STORE2(res0, res1) \
  dp[0 ] = res0; \
  dp[chan1] = res1

#endif /* VM_LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff): copy the two source samples sp[0] and sp[chan1] into
 * the 32-bit slots buff[i] and buff[i + 1].
 * Expects 'sp', 'chan1' and 'i' in the enclosing scope.
 *
 * With _NO_LONGLONG two separate stores are used (two statements).
 * Otherwise both samples are packed into one 64-bit store through a
 * (mlib_s64 *) cast; the halves are ordered by endianness, intended so
 * that sp[0] lands in buff[i] and sp[chan1] in buff[i + 1].  This form
 * requires buff + i to be suitably aligned for a 64-bit access.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff) \
  buff[i ] = sp[0]; \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef VM_LITTLE_ENDIAN

#define LOAD_BUFF(buff) \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* VM_LITTLE_ENDIAN */

#define LOAD_BUFF(buff) \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* VM_LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
133
/***************************************************************/
/*
 * Overlay of one mlib_d64 with two 32-bit integers.
 * The convolution routines use it to read or write a pair of mlib_s32
 * values with a single 64-bit access (via the d64 member).
 * Note that the members of 'f32s' are declared mlib_s32 as well, so
 * 'i32s' and 'f32s' have identical layout.
 */
typedef union {
  mlib_d64 d64;     /* the 64-bit view */
  struct {
    mlib_s32 i0;    /* first 32-bit word in memory order */
    mlib_s32 i1;    /* second 32-bit word in memory order */
  } i32s;
  struct {
    mlib_s32 f0;
    mlib_s32 f1;
  } f32s;
} d64_2x32;
146
/***************************************************************/
/*
 * Image width (in pixels) up to which the routines use their on-stack
 * line buffers; for wider images they call mlib_malloc instead.
 */
#define BUFF_LINE 256

/***************************************************************/
/*
 * DEF_VARS(type): declares the common locals of a convolution routine.
 *   adr_src/adr_dst  image data pointers
 *   sl/dl            pointers to the current source/destination line
 *   sp/dp            running pointers within a line
 *   pbuff            working buffer; starts out as the caller's local
 *                    array 'buff', which must already be declared
 *   wid, hgt         image size;  sll, dll: line strides in elements
 *   nchannel, chan1  channel count / element step between pixels
 *   i, j, c          loop counters (column, row, channel)
 * Ends without a semicolon; the caller supplies it.
 */
#define DEF_VARS(type) \
  type *adr_src, *sl, *sp = NULL; \
  type *adr_dst, *dl, *dp = NULL; \
  FTYPE *pbuff = buff; \
  mlib_s32 wid, hgt, sll, dll; \
  mlib_s32 nchannel, chan1; \
  mlib_s32 i, j, c

/***************************************************************/
/*
 * LOAD_KERNEL3(): declarations plus set-up for the 3x3 routine.
 * Declares the nine scaled coefficients k0..k8 and the pixel temporaries
 * p00..p23, then computes scalef = DSCALE / 2^scalef_expon (dividing in
 * steps of 2^30 while scalef_expon > 30, so each shift count stays at or
 * below 30) and sets kN = scalef * kern[N].
 * Expects 'kern' and a modifiable 'scalef_expon' in scope.
 * Contains declarations, so it must appear in the declaration part of
 * the function; everything after it is a statement.
 */
#define LOAD_KERNEL3() \
  FTYPE scalef = DSCALE; \
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8; \
  FTYPE p00, p01, p02, p03, \
        p10, p11, p12, p13, \
        p20, p21, p22, p23; \
  \
  while (scalef_expon > 30) { \
    scalef /= (1 << 30); \
    scalef_expon -= 30; \
  } \
  \
  scalef /= (1 << scalef_expon); \
  \
  /* keep kernel in regs */ \
  k0 = scalef * kern[0]; k1 = scalef * kern[1]; k2 = scalef * kern[2]; \
  k3 = scalef * kern[3]; k4 = scalef * kern[4]; k5 = scalef * kern[5]; \
  k6 = scalef * kern[6]; k7 = scalef * kern[7]; k8 = scalef * kern[8]

/***************************************************************/
/*
 * LOAD_KERNEL(SIZE): same scale computation as LOAD_KERNEL3, but the SIZE
 * scaled coefficients are written to the caller's array k[] using the
 * caller's loop counter j.
 * Expects 'kern', 'k', 'j' and a modifiable 'scalef_expon' in scope.
 */
#define LOAD_KERNEL(SIZE) \
  FTYPE scalef = DSCALE; \
  \
  while (scalef_expon > 30) { \
    scalef /= (1 << 30); \
    scalef_expon -= 30; \
  } \
  \
  scalef /= (1 << scalef_expon); \
  \
  for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]

/***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type): fetch size, channel count and data
 * pointer from 'src', and stride and data pointer from 'dst'.
 * The strides returned by mlib_ImageGetStride are divided by sizeof(type),
 * so sll/dll count elements of 'type'.
 * Expects src, dst and the DEF_VARS-style locals in scope.
 */
#define GET_SRC_DST_PARAMETERS(type) \
  hgt = mlib_ImageGetHeight(src); \
  wid = mlib_ImageGetWidth(src); \
  nchannel = mlib_ImageGetChannels(src); \
  sll = mlib_ImageGetStride(src) / sizeof(type); \
  dll = mlib_ImageGetStride(dst) / sizeof(type); \
  adr_src = (type *)mlib_ImageGetData(src); \
  adr_dst = (type *)mlib_ImageGetData(dst)
201
/***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val): saturate the integer 'val' to the range of the
 * pixel type and assign it to the lvalue 'dst'.
 *
 * Every variant expands to ONE complete, self-terminated statement, so the
 * call sites in this file write it without a trailing semicolon:
 *
 *     CLAMP_STORE(dp[0], pix0)
 *
 * (A do { } while (0) wrapper is deliberately not used: it would require a
 * semicolon at those call sites.)  Both arguments are parenthesized in the
 * expansion, and 'val' is evaluated more than once - pass a plain variable.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  if ((val) & 0xffffff00) {                                     \
    if ((val) < MLIB_U8_MIN)                                    \
      (dst) = MLIB_U8_MIN;                                      \
    else                                                        \
      (dst) = MLIB_U8_MAX;                                      \
  } else {                                                      \
    (dst) = (mlib_u8)(val);                                     \
  }

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)                                   \
  {                                                             \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  }

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)                                   \
  {                                                             \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  }

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
248
249 /***************************************************************/
250 #define KSIZE 3
251
252 mlib_status CONV_FUNC(3x3)(mlib_image *dst,
253 const mlib_image *src,
254 const mlib_s32 *kern,
255 mlib_s32 scalef_expon,
256 mlib_s32 cmask)
257 {
258 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
259 DEF_VARS(DTYPE);
260 DTYPE *sl1;
261 mlib_s32 chan2;
262 mlib_s32 *buffo, *buffi;
263 DTYPE *sl2;
264 #ifndef __sparc
265 mlib_s32 d0, d1;
266 #endif /* __sparc */
267 LOAD_KERNEL3();
268 GET_SRC_DST_PARAMETERS(DTYPE);
269
270 if (wid > BUFF_LINE) {
271 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
272
273 if (pbuff == NULL) return MLIB_FAILURE;
274 }
275
276 buff0 = pbuff;
277 buff1 = buff0 + wid;
278 buff2 = buff1 + wid;
279 buff3 = buff2 + wid;
280 buffo = (mlib_s32*)(buff3 + wid);
281 buffi = buffo + (wid &~ 1);
282
283 chan1 = nchannel;
284 chan2 = chan1 + chan1;
285
286 wid -= (KSIZE - 1);
287 hgt -= (KSIZE - 1);
288
289 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
290
291 for (c = 0; c < nchannel; c++) {
292 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
293
294 sl = adr_src + c;
295 dl = adr_dst + c;
296
297 sl1 = sl + sll;
298 sl2 = sl1 + sll;
299 #ifdef __SUNPRO_C
300 #pragma pipeloop(0)
301 #endif /* __SUNPRO_C */
302 for (i = 0; i < wid + (KSIZE - 1); i++) {
303 buff0[i] = (FTYPE)sl[i*chan1];
304 buff1[i] = (FTYPE)sl1[i*chan1];
305 buff2[i] = (FTYPE)sl2[i*chan1];
306 }
307
308 sl += KSIZE*sll;
309
310 for (j = 0; j < hgt; j++) {
311 FTYPE s0, s1;
312
313 p02 = buff0[0];
314 p12 = buff1[0];
315 p22 = buff2[0];
316
317 p03 = buff0[1];
318 p13 = buff1[1];
319 p23 = buff2[1];
320
321 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
322 s1 = p03 * k0 + p13 * k3 + p23 * k6;
323
324 sp = sl;
325 dp = dl;
326
327 #ifdef __SUNPRO_C
328 #pragma pipeloop(0)
329 #endif /* __SUNPRO_C */
330 for (i = 0; i <= (wid - 2); i += 2) {
331 #ifdef __sparc
332 #ifdef _NO_LONGLONG
333 mlib_s32 o64_1, o64_2;
334 #else /* _NO_LONGLONG */
335 mlib_s64 o64;
336 #endif /* _NO_LONGLONG */
337 #endif /* __sparc */
338 d64_2x32 dd;
339
340 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
341 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
342
343 LOAD_BUFF(buffi);
344
345 dd.d64 = *(FTYPE *)(buffi + i);
346 buff3[i ] = (FTYPE)dd.i32s.i0;
347 buff3[i + 1] = (FTYPE)dd.i32s.i1;
348
349 #ifndef __sparc
350 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
351 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
352
353 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
354 s1 = p03 * k0 + p13 * k3 + p23 * k6;
355
356 dp[0 ] = FROM_S32(d0);
357 dp[chan1] = FROM_S32(d1);
358
359 #else /* __sparc */
360
361 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
362 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
363 *(FTYPE *)(buffo + i) = dd.d64;
364
365 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
366 s1 = p03 * k0 + p13 * k3 + p23 * k6;
367
368 #ifdef _NO_LONGLONG
369
370 o64_1 = buffo[i];
371 o64_2 = buffo[i+1];
372 #if IMG_TYPE != 1
373 STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
374 #else
375 STORE2(o64_1 >> 24, o64_2 >> 24);
376 #endif /* IMG_TYPE != 1 */
377
378 #else /* _NO_LONGLONG */
379
380 o64 = *(mlib_s64*)(buffo + i);
381 #if IMG_TYPE != 1
382 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
383 #else
384 STORE2(o64 >> 56, o64 >> 24);
385 #endif /* IMG_TYPE != 1 */
386 #endif /* _NO_LONGLONG */
387 #endif /* __sparc */
388
389 sp += chan2;
390 dp += chan2;
391 }
392
393 for (; i < wid; i++) {
394 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
395 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
396 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
397
398 buffi[i] = (mlib_s32)sp[0];
399 buff3[i] = (FTYPE)buffi[i];
400
401 #ifndef __sparc
402
403 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
404 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
405
406 dp[0] = FROM_S32(d0);
407
408 #else /* __sparc */
409
410 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
411 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
412 #if IMG_TYPE != 1
413 dp[0] = FROM_S32(buffo[i]);
414 #else
415 dp[0] = buffo[i] >> 24;
416 #endif /* IMG_TYPE != 1 */
417 #endif /* __sparc */
418
419 sp += chan1;
420 dp += chan1;
421 }
422
423 buffi[wid] = (mlib_s32)sp[0];
424 buff3[wid] = (FTYPE)buffi[wid];
425 buffi[wid + 1] = (mlib_s32)sp[chan1];
426 buff3[wid + 1] = (FTYPE)buffi[wid + 1];
427
428 sl += sll;
429 dl += dll;
430
431 buffT = buff0;
432 buff0 = buff1;
433 buff1 = buff2;
434 buff2 = buff3;
435 buff3 = buffT;
436 }
437 }
438
439 #ifdef __sparc
440 #if IMG_TYPE == 1
441 {
442 mlib_s32 amask = (1 << nchannel) - 1;
443
444 if ((cmask & amask) != amask) {
445 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
446 } else {
447 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
448 }
449 }
450
451 #endif /* IMG_TYPE == 1 */
452 #endif /* __sparc */
453
454 if (pbuff != buff) mlib_free(pbuff);
455
456 return MLIB_SUCCESS;
457 }
458
459 /***************************************************************/
460 #ifndef __sparc /* for x86, using integer multiplies is faster */
461
462 mlib_status CONV_FUNC_I(3x3)(mlib_image *dst,
463 const mlib_image *src,
464 const mlib_s32 *kern,
465 mlib_s32 scalef_expon,
466 mlib_s32 cmask)
467 {
468 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2;
469 DTYPE *adr_dst, *dl, *dp;
470 mlib_s32 wid, hgt, sll, dll;
471 mlib_s32 nchannel, chan1, chan2;
472 mlib_s32 i, j, c;
473 mlib_s32 shift1, shift2;
474 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
475 mlib_s32 p02, p03,
476 p12, p13,
477 p22, p23;
478
479 #if IMG_TYPE != 1
480 shift1 = 16;
481 #else
482 shift1 = 8;
483 #endif /* IMG_TYPE != 1 */
484
485 shift2 = scalef_expon - shift1;
486
487 /* keep kernel in regs */
488 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
489 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
490 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;
491
492 GET_SRC_DST_PARAMETERS(DTYPE);
493
494 chan1 = nchannel;
495 chan2 = chan1 + chan1;
496
497 wid -= (KSIZE - 1);
498 hgt -= (KSIZE - 1);
499
500 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
501
502 for (c = 0; c < chan1; c++) {
503 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
504
505 sl = adr_src + c;
506 dl = adr_dst + c;
507
508 for (j = 0; j < hgt; j++) {
509 mlib_s32 s0, s1;
510 mlib_s32 pix0, pix1;
511
512 dp = dl;
513 sp0 = sl;
514 sp1 = sp0 + sll;
515 sp2 = sp1 + sll;
516
517 p02 = sp0[0];
518 p12 = sp1[0];
519 p22 = sp2[0];
520
521 p03 = sp0[chan1];
522 p13 = sp1[chan1];
523 p23 = sp2[chan1];
524
525 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
526 s1 = p03 * k0 + p13 * k3 + p23 * k6;
527
528 sp0 += chan2;
529 sp1 += chan2;
530 sp2 += chan2;
531
532 #ifdef __SUNPRO_C
533 #pragma pipeloop(0)
534 #endif /* __SUNPRO_C */
535 for (i = 0; i <= (wid - 2); i += 2) {
536 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
537 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
538
539 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
540 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
541 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
542
543 CLAMP_STORE(dp[0], pix0)
544 CLAMP_STORE(dp[chan1], pix1)
545
546 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
547 s1 = p03 * k0 + p13 * k3 + p23 * k6;
548
549 sp0 += chan2;
550 sp1 += chan2;
551 sp2 += chan2;
552 dp += chan2;
553 }
554
555 if (wid & 1) {
556 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
557 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
558 CLAMP_STORE(dp[0], pix0)
559 }
560
561 sl += sll;
562 dl += dll;
563 }
564 }
565
566 return MLIB_SUCCESS;
567 }
568
569 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
570
571 /***************************************************************/
572 #undef KSIZE
573 #define KSIZE 4
574
575 mlib_status CONV_FUNC(4x4)(mlib_image *dst,
576 const mlib_image *src,
577 const mlib_s32 *kern,
578 mlib_s32 scalef_expon,
579 mlib_s32 cmask)
580 {
581 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
582 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
583 FTYPE k[KSIZE*KSIZE];
584 mlib_s32 d0, d1;
585 FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
586 FTYPE p00, p01, p02, p03, p04,
587 p10, p11, p12, p13, p14,
588 p20, p21, p22, p23,
589 p30, p31, p32, p33;
590 DEF_VARS(DTYPE);
591 DTYPE *sl1;
592 mlib_s32 chan2;
593 mlib_s32 *buffo, *buffi;
594 DTYPE *sl2, *sl3;
595 LOAD_KERNEL(KSIZE*KSIZE);
596 GET_SRC_DST_PARAMETERS(DTYPE);
597
598 if (wid > BUFF_LINE) {
599 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
600
601 if (pbuff == NULL) return MLIB_FAILURE;
602 }
603
604 buff0 = pbuff;
605 buff1 = buff0 + wid;
606 buff2 = buff1 + wid;
607 buff3 = buff2 + wid;
608 buff4 = buff3 + wid;
609 buffd = buff4 + wid;
610 buffo = (mlib_s32*)(buffd + wid);
611 buffi = buffo + (wid &~ 1);
612
613 chan1 = nchannel;
614 chan2 = chan1 + chan1;
615
616 wid -= (KSIZE - 1);
617 hgt -= (KSIZE - 1);
618
619 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
620
621 for (c = 0; c < nchannel; c++) {
622 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
623
624 sl = adr_src + c;
625 dl = adr_dst + c;
626
627 sl1 = sl + sll;
628 sl2 = sl1 + sll;
629 sl3 = sl2 + sll;
630 #ifdef __SUNPRO_C
631 #pragma pipeloop(0)
632 #endif /* __SUNPRO_C */
633 for (i = 0; i < wid + (KSIZE - 1); i++) {
634 buff0[i] = (FTYPE)sl[i*chan1];
635 buff1[i] = (FTYPE)sl1[i*chan1];
636 buff2[i] = (FTYPE)sl2[i*chan1];
637 buff3[i] = (FTYPE)sl3[i*chan1];
638 }
639
640 sl += KSIZE*sll;
641
642 for (j = 0; j < hgt; j++) {
643 d64_2x32 dd;
644
645 /*
646 * First loop on two first lines of kernel
647 */
648 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
649 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
650
651 sp = sl;
652 dp = dl;
653
654 p02 = buff0[0];
655 p12 = buff1[0];
656 p03 = buff0[1];
657 p13 = buff1[1];
658 p04 = buff0[2];
659
660 #ifdef __SUNPRO_C
661 #pragma pipeloop(0)
662 #endif /* __SUNPRO_C */
663 for (i = 0; i <= (wid - 2); i += 2) {
664 p00 = p02; p10 = p12;
665 p01 = p03; p11 = p13;
666 p02 = p04; p12 = buff1[i + 2];
667 p03 = buff0[i + 3]; p13 = buff1[i + 3];
668 p04 = buff0[i + 4]; p14 = buff1[i + 4];
669
670 LOAD_BUFF(buffi);
671
672 dd.d64 = *(FTYPE *)(buffi + i);
673 buff4[i ] = (FTYPE)dd.i32s.i0;
674 buff4[i + 1] = (FTYPE)dd.i32s.i1;
675
676 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
677 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
678 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
679 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
680
681 sp += chan2;
682 dp += chan2;
683 }
684
685 /*
686 * Second loop on two last lines of kernel
687 */
688 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
689 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
690
691 sp = sl;
692 dp = dl;
693
694 p02 = buff2[0];
695 p12 = buff3[0];
696 p03 = buff2[1];
697 p13 = buff3[1];
698 p04 = buff2[2];
699
700 #ifdef __SUNPRO_C
701 #pragma pipeloop(0)
702 #endif /* __SUNPRO_C */
703 for (i = 0; i <= (wid - 2); i += 2) {
704 p00 = p02; p10 = p12;
705 p01 = p03; p11 = p13;
706 p02 = p04; p12 = buff3[i + 2];
707 p03 = buff2[i + 3]; p13 = buff3[i + 3];
708 p04 = buff2[i + 4]; p14 = buff3[i + 4];
709
710 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
711 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
712 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
713 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
714
715 dp[0 ] = FROM_S32(d0);
716 dp[chan1] = FROM_S32(d1);
717
718 sp += chan2;
719 dp += chan2;
720 }
721
722 /* last pixels */
723 for (; i < wid; i++) {
724 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
725 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
726 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
727 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
728
729 buff4[i] = (FTYPE)sp[0];
730
731 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
732 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
733 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
734 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
735
736 dp[0] = FROM_S32(buffo[i]);
737
738 sp += chan1;
739 dp += chan1;
740 }
741
742 buff4[wid ] = (FTYPE)sp[0];
743 buff4[wid + 1] = (FTYPE)sp[chan1];
744 buff4[wid + 2] = (FTYPE)sp[chan2];
745
746 /* next line */
747 sl += sll;
748 dl += dll;
749
750 buffT = buff0;
751 buff0 = buff1;
752 buff1 = buff2;
753 buff2 = buff3;
754 buff3 = buff4;
755 buff4 = buffT;
756 }
757 }
758
759 if (pbuff != buff) mlib_free(pbuff);
760
761 return MLIB_SUCCESS;
762 }
763
764 /***************************************************************/
765 #undef KSIZE
766 #define KSIZE 5
767
768 mlib_status CONV_FUNC(5x5)(mlib_image *dst,
769 const mlib_image *src,
770 const mlib_s32 *kern,
771 mlib_s32 scalef_expon,
772 mlib_s32 cmask)
773 {
774 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
775 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
776 FTYPE k[KSIZE*KSIZE];
777 mlib_s32 d0, d1;
778 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
779 FTYPE p00, p01, p02, p03, p04, p05,
780 p10, p11, p12, p13, p14, p15,
781 p20, p21, p22, p23, p24,
782 p30, p31, p32, p33, p34,
783 p40, p41, p42, p43, p44;
784 DEF_VARS(DTYPE);
785 DTYPE *sl1;
786 mlib_s32 chan2;
787 mlib_s32 *buffo, *buffi;
788 DTYPE *sl2, *sl3, *sl4;
789 LOAD_KERNEL(KSIZE*KSIZE);
790 GET_SRC_DST_PARAMETERS(DTYPE);
791
792 if (wid > BUFF_LINE) {
793 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
794
795 if (pbuff == NULL) return MLIB_FAILURE;
796 }
797
798 buff0 = pbuff;
799 buff1 = buff0 + wid;
800 buff2 = buff1 + wid;
801 buff3 = buff2 + wid;
802 buff4 = buff3 + wid;
803 buff5 = buff4 + wid;
804 buffd = buff5 + wid;
805 buffo = (mlib_s32*)(buffd + wid);
806 buffi = buffo + (wid &~ 1);
807
808 chan1 = nchannel;
809 chan2 = chan1 + chan1;
810
811 wid -= (KSIZE - 1);
812 hgt -= (KSIZE - 1);
813
814 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
815
816 for (c = 0; c < nchannel; c++) {
817 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
818
819 sl = adr_src + c;
820 dl = adr_dst + c;
821
822 sl1 = sl + sll;
823 sl2 = sl1 + sll;
824 sl3 = sl2 + sll;
825 sl4 = sl3 + sll;
826 #ifdef __SUNPRO_C
827 #pragma pipeloop(0)
828 #endif /* __SUNPRO_C */
829 for (i = 0; i < wid + (KSIZE - 1); i++) {
830 buff0[i] = (FTYPE)sl[i*chan1];
831 buff1[i] = (FTYPE)sl1[i*chan1];
832 buff2[i] = (FTYPE)sl2[i*chan1];
833 buff3[i] = (FTYPE)sl3[i*chan1];
834 buff4[i] = (FTYPE)sl4[i*chan1];
835 }
836
837 sl += KSIZE*sll;
838
839 for (j = 0; j < hgt; j++) {
840 d64_2x32 dd;
841
842 /*
843 * First loop
844 */
845 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
846 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
847
848 sp = sl;
849 dp = dl;
850
851 p02 = buff0[0];
852 p12 = buff1[0];
853 p03 = buff0[1];
854 p13 = buff1[1];
855 p04 = buff0[2];
856 p14 = buff1[2];
857
858 #ifdef __SUNPRO_C
859 #pragma pipeloop(0)
860 #endif /* __SUNPRO_C */
861 for (i = 0; i <= (wid - 2); i += 2) {
862 p00 = p02; p10 = p12;
863 p01 = p03; p11 = p13;
864 p02 = p04; p12 = p14;
865
866 LOAD_BUFF(buffi);
867
868 p03 = buff0[i + 3]; p13 = buff1[i + 3];
869 p04 = buff0[i + 4]; p14 = buff1[i + 4];
870 p05 = buff0[i + 5]; p15 = buff1[i + 5];
871
872 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
873 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
874 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
875 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
876
877 sp += chan2;
878 dp += chan2;
879 }
880
881 /*
882 * Second loop
883 */
884 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
885 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
886
887 sp = sl;
888 dp = dl;
889
890 p02 = buff2[0];
891 p12 = buff3[0];
892 p03 = buff2[1];
893 p13 = buff3[1];
894 p04 = buff2[2];
895 p14 = buff3[2];
896
897 #ifdef __SUNPRO_C
898 #pragma pipeloop(0)
899 #endif /* __SUNPRO_C */
900 for (i = 0; i <= (wid - 2); i += 2) {
901 p00 = p02; p10 = p12;
902 p01 = p03; p11 = p13;
903
904 p02 = buff2[i + 2]; p12 = buff3[i + 2];
905 p03 = buff2[i + 3]; p13 = buff3[i + 3];
906 p04 = buff2[i + 4]; p14 = buff3[i + 4];
907 p05 = buff2[i + 5]; p15 = buff3[i + 5];
908
909 dd.d64 = *(FTYPE *)(buffi + i);
910 buff5[i ] = (FTYPE)dd.i32s.i0;
911 buff5[i + 1] = (FTYPE)dd.i32s.i1;
912
913 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
914 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
915 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
916 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
917
918 sp += chan2;
919 dp += chan2;
920 }
921
922 /*
923 * 3 loop
924 */
925 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
926
927 sp = sl;
928 dp = dl;
929
930 p02 = buff4[0];
931 p03 = buff4[1];
932 p04 = buff4[2];
933 p05 = buff4[3];
934
935 #ifdef __SUNPRO_C
936 #pragma pipeloop(0)
937 #endif /* __SUNPRO_C */
938 for (i = 0; i <= (wid - 2); i += 2) {
939 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
940
941 p04 = buff4[i + 4]; p05 = buff4[i + 5];
942
943 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
944 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
945
946 dp[0 ] = FROM_S32(d0);
947 dp[chan1] = FROM_S32(d1);
948
949 sp += chan2;
950 dp += chan2;
951 }
952
953 /* last pixels */
954 for (; i < wid; i++) {
955 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
956 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
957 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
958 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
959 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
960
961 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
962 p43 = buff4[i + 3]; p44 = buff4[i + 4];
963
964 buff5[i] = (FTYPE)sp[0];
965
966 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
967 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
968 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
969 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
970 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
971
972 dp[0] = FROM_S32(buffo[i]);
973
974 sp += chan1;
975 dp += chan1;
976 }
977
978 buff5[wid ] = (FTYPE)sp[0];
979 buff5[wid + 1] = (FTYPE)sp[chan1];
980 buff5[wid + 2] = (FTYPE)sp[chan2];
981 buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
982
983 /* next line */
984 sl += sll;
985 dl += dll;
986
987 buffT = buff0;
988 buff0 = buff1;
989 buff1 = buff2;
990 buff2 = buff3;
991 buff3 = buff4;
992 buff4 = buff5;
993 buff5 = buffT;
994 }
995 }
996
997 if (pbuff != buff) mlib_free(pbuff);
998
999 return MLIB_SUCCESS;
1000 }
1001
/***************************************************************/
#ifndef __sparc /* for x86, using integer multiplies is faster */

/*
 * 5x5 kernel applied with 32-bit integer arithmetic; destination edge
 * pixels are not written.
 *
 *   dst          destination image; results are written starting
 *                (KSIZE - 1)/2 lines down and (KSIZE - 1)/2 pixels in
 *   src          source image; size and channel count are taken from it
 *   kern         25 integer coefficients, kernel line by kernel line
 *   scalef_expon scale exponent; each coefficient is pre-shifted right by
 *                shift1 and every sum is shifted right by
 *                scalef_expon - shift1 before being clamped and stored
 *   cmask        channel mask; channel c is processed when bit
 *                (chan1 - 1 - c) is set
 *
 * Returns MLIB_FAILURE if the partial-sum buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * Each output line is built in three passes that read the source image
 * directly:
 *   1. kernel lines 0-1 -> buffd
 *   2. kernel lines 2-3 added to buffd
 *   3. kernel line 4 added, result shifted, clamped and stored
 * Within a pass the window slides two pixels per iteration; p0x/p1x hold
 * the six most recent columns of the two source lines being processed.
 */
mlib_status CONV_FUNC_I(5x5)(mlib_image       *dst,
                             const mlib_image *src,
                             const mlib_s32   *kern,
                             mlib_s32         scalef_expon,
                             mlib_s32         cmask)
{
  mlib_s32 buff[BUFF_LINE];       /* on-stack partial sums for narrow images */
  mlib_s32 *buffd;                /* partial sums of the current output line */
  mlib_s32 k[KSIZE*KSIZE];        /* pre-shifted kernel */
  mlib_s32 shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
  mlib_s32 p00, p01, p02, p03, p04, p05,
           p10, p11, p12, p13, p14, p15;
  DTYPE    *adr_src, *sl, *sp0, *sp1;
  DTYPE    *adr_dst, *dl, *dp;
  mlib_s32 *pbuff = buff;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1, chan2, chan3, chan4;
  mlib_s32 i, j, c;

#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */

  shift2 = scalef_expon - shift1;

  for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* one partial sum per pixel of a line; heap only for wide images */
  if (wid > BUFF_LINE) {
    pbuff = mlib_malloc(sizeof(mlib_s32)*wid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  buffd = pbuff;

  /* element offsets of the 1st..4th following pixel of the same channel */
  chan1 = nchannel;
  chan2 = chan1 + chan1;
  chan3 = chan2 + chan1;
  chan4 = chan3 + chan1;

  /* from here on wid/hgt are the size of the OUTPUT area */
  wid -= (KSIZE - 1);
  hgt -= (KSIZE - 1);

  adr_dst += ((KSIZE - 1)/2)*(dll + chan1);

  for (c = 0; c < chan1; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    for (j = 0; j < hgt; j++) {
      mlib_s32 pix0, pix1;
      /*
       *  First loop
       */
      sp0 = sl;
      sp1 = sp0 + sll;
      dp = dl;

      k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
      k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];

      /* preload columns 0..3 of source lines j and j + 1 */
      p02 = sp0[0];     p12 = sp1[0];
      p03 = sp0[chan1]; p13 = sp1[chan1];
      p04 = sp0[chan2]; p14 = sp1[chan2];
      p05 = sp0[chan3]; p15 = sp1[chan3];

      sp0 += chan4;
      sp1 += chan4;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        /* slide the window by two columns, then read the two new ones */
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];
        p05 = sp0[chan1]; p15 = sp1[chan1];

        buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                        p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                        p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);

        sp0 += chan2;
        sp1 += chan2;
        /* dp is not dereferenced in this pass; it is reset before the next */
        dp += chan2;
      }

      /* odd output width: one more partial sum, one new column */
      if (wid & 1) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];

        buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                    p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
      }

      /*
       *  Second loop
       */
      sp0 = sl + 2*sll;
      sp1 = sp0 + sll;
      dp = dl;

      k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
      k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];

      /* preload columns 0..3 of source lines j + 2 and j + 3 */
      p02 = sp0[0];     p12 = sp1[0];
      p03 = sp0[chan1]; p13 = sp1[chan1];
      p04 = sp0[chan2]; p14 = sp1[chan2];
      p05 = sp0[chan3]; p15 = sp1[chan3];

      sp0 += chan4;
      sp1 += chan4;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];
        p05 = sp0[chan1]; p15 = sp1[chan1];

        /* accumulate on top of the first pass */
        buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);

        sp0 += chan2;
        sp1 += chan2;
        /* dp is not dereferenced in this pass; it is reset before the next */
        dp += chan2;
      }

      if (wid & 1) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];

        buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
      }

      /*
       *  3 loop
       */
      dp = dl;
      sp0 = sl + 4*sll;

      k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];

      /* preload columns 0..3 of source line j + 4 */
      p02 = sp0[0];
      p03 = sp0[chan1];
      p04 = sp0[chan2];
      p05 = sp0[chan3];

      /* chan2 + chan2 equals chan4: skip the four preloaded columns */
      sp0 += chan2 + chan2;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p01 = p03; p02 = p04; p03 = p05;

        p04 = sp0[0]; p05 = sp0[chan1];

        /* add the last kernel line, then scale down to pixel range */
        pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
                p03 * k3 + p04 * k4) >> shift2;
        pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
                p04 * k3 + p05 * k4) >> shift2;

        CLAMP_STORE(dp[0],     pix0)
        CLAMP_STORE(dp[chan1], pix1)

        dp  += chan2;
        sp0 += chan2;
      }

      if (wid & 1) {
        p00 = p02; p01 = p03; p02 = p04; p03 = p05;

        p04 = sp0[0];

        pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
                p03 * k3 + p04 * k4) >> shift2;
        CLAMP_STORE(dp[0],     pix0)
      }

      /* next line */
      sl += sll;
      dl += dll;
    }
  }

  /* release the partial-sum buffer only if it came from mlib_malloc */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}

#endif /* __sparc ( for x86, using integer multiplies is faster ) */
1224
1225 /***************************************************************/
1226 #if IMG_TYPE == 1
1227
1228 #undef KSIZE
1229 #define KSIZE 7
1230
1231 mlib_status CONV_FUNC(7x7)(mlib_image *dst,
1232 const mlib_image *src,
1233 const mlib_s32 *kern,
1234 mlib_s32 scalef_expon,
1235 mlib_s32 cmask)
1236 {
1237 FTYPE buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1238 FTYPE k[KSIZE*KSIZE];
1239 mlib_s32 l, m, buff_ind;
1240 mlib_s32 d0, d1;
1241 FTYPE k0, k1, k2, k3, k4, k5, k6;
1242 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1243 DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1244 DEF_VARS(DTYPE);
1245 DTYPE *sl1;
1246 mlib_s32 chan2;
1247 mlib_s32 *buffo, *buffi;
1248 LOAD_KERNEL(KSIZE*KSIZE);
1249 GET_SRC_DST_PARAMETERS(DTYPE);
1250
1251 if (wid > BUFF_LINE) {
1252 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
1253
1254 if (pbuff == NULL) return MLIB_FAILURE;
1255 }
1256
1257 for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
1258 for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1259 buffd = buffs[KSIZE] + wid;
1260 buffo = (mlib_s32*)(buffd + wid);
1261 buffi = buffo + (wid &~ 1);
1262
1263 chan1 = nchannel;
1264 chan2 = chan1 + chan1;
1265
1266 wid -= (KSIZE - 1);
1267 hgt -= (KSIZE - 1);
1268
1269 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1270
1271 for (c = 0; c < nchannel; c++) {
1272 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1273
1274 sl = adr_src + c;
1275 dl = adr_dst + c;
1276
1277 sl1 = sl + sll;
1278 sl2 = sl1 + sll;
1279 sl3 = sl2 + sll;
1280 sl4 = sl3 + sll;
1281 sl5 = sl4 + sll;
1282 sl6 = sl5 + sll;
1283 #ifdef __SUNPRO_C
1284 #pragma pipeloop(0)
1285 #endif /* __SUNPRO_C */
1286 for (i = 0; i < wid + (KSIZE - 1); i++) {
1287 buffs[0][i] = (FTYPE)sl[i*chan1];
1288 buffs[1][i] = (FTYPE)sl1[i*chan1];
1289 buffs[2][i] = (FTYPE)sl2[i*chan1];
1290 buffs[3][i] = (FTYPE)sl3[i*chan1];
1291 buffs[4][i] = (FTYPE)sl4[i*chan1];
1292 buffs[5][i] = (FTYPE)sl5[i*chan1];
1293 buffs[6][i] = (FTYPE)sl6[i*chan1];
1294 }
1295
1296 buff_ind = 0;
1297
1298 #ifdef __SUNPRO_C
1299 #pragma pipeloop(0)
1300 #endif /* __SUNPRO_C */
1301 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1302
1303 sl += KSIZE*sll;
1304
1305 for (j = 0; j < hgt; j++) {
1306 FTYPE **buffc = buffs + buff_ind;
1307 FTYPE *buffn = buffc[KSIZE];
1308 FTYPE *pk = k;
1309
1310 for (l = 0; l < KSIZE; l++) {
1311 FTYPE *buff = buffc[l];
1312 d64_2x32 dd;
1313
1314 sp = sl;
1315 dp = dl;
1316
1317 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1318 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1319
1320 k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1321 k4 = *pk++; k5 = *pk++; k6 = *pk++;
1322
1323 if (l < (KSIZE - 1)) {
1324 #ifdef __SUNPRO_C
1325 #pragma pipeloop(0)
1326 #endif /* __SUNPRO_C */
1327 for (i = 0; i <= (wid - 2); i += 2) {
1328 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1329
1330 p6 = buff[i + 6]; p7 = buff[i + 7];
1331
1332 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1333 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1334 }
1335
1336 } else {
1337 #ifdef __SUNPRO_C
1338 #pragma pipeloop(0)
1339 #endif /* __SUNPRO_C */
1340 for (i = 0; i <= (wid - 2); i += 2) {
1341 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1342
1343 p6 = buff[i + 6]; p7 = buff[i + 7];
1344
1345 LOAD_BUFF(buffi);
1346
1347 dd.d64 = *(FTYPE *)(buffi + i);
1348 buffn[i ] = (FTYPE)dd.i32s.i0;
1349 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1350
1351 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1352 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1353
1354 dp[0 ] = FROM_S32(d0);
1355 dp[chan1] = FROM_S32(d1);
1356
1357 buffd[i ] = 0.0;
1358 buffd[i + 1] = 0.0;
1359
1360 sp += chan2;
1361 dp += chan2;
1362 }
1363 }
1364 }
1365
1366 /* last pixels */
1367 for (; i < wid; i++) {
1368 FTYPE *pk = k, s = 0;
1369 mlib_s32 d0;
1370
1371 for (l = 0; l < KSIZE; l++) {
1372 FTYPE *buff = buffc[l] + i;
1373
1374 for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1375 }
1376
1377 d0 = D2I(s);
1378 dp[0] = FROM_S32(d0);
1379
1380 buffn[i] = (FTYPE)sp[0];
1381
1382 sp += chan1;
1383 dp += chan1;
1384 }
1385
1386 for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
1387
1388 /* next line */
1389 sl += sll;
1390 dl += dll;
1391
1392 buff_ind++;
1393
1394 if (buff_ind >= KSIZE + 1) buff_ind = 0;
1395 }
1396 }
1397
1398 if (pbuff != buff) mlib_free(pbuff);
1399
1400 return MLIB_SUCCESS;
1401 }
1402
1403 #endif /* IMG_TYPE == 1 */
1404
1405 /***************************************************************/
1406 #define MAX_KER 7
1407 #define MAX_N 15
1408
mlib_ImageConv1xN(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dn,mlib_s32 cmask)1409 static mlib_status mlib_ImageConv1xN(mlib_image *dst,
1410 const mlib_image *src,
1411 const mlib_d64 *k,
1412 mlib_s32 n,
1413 mlib_s32 dn,
1414 mlib_s32 cmask)
1415 {
1416 FTYPE buff[BUFF_SIZE];
1417 mlib_s32 off, kh;
1418 mlib_s32 d0, d1;
1419 const FTYPE *pk;
1420 FTYPE k0, k1, k2, k3;
1421 FTYPE p0, p1, p2, p3, p4;
1422 DEF_VARS(DTYPE);
1423 DTYPE *sl_c, *dl_c, *sl0;
1424 mlib_s32 l, hsize, max_hsize;
1425 GET_SRC_DST_PARAMETERS(DTYPE);
1426
1427 hgt -= (n - 1);
1428 adr_dst += dn*dll;
1429
1430 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1431
1432 if (!max_hsize) max_hsize = 1;
1433
1434 if (max_hsize > BUFF_SIZE) {
1435 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1436 }
1437
1438 chan1 = nchannel;
1439
1440 sl_c = adr_src;
1441 dl_c = adr_dst;
1442
1443 for (l = 0; l < hgt; l += hsize) {
1444 hsize = hgt - l;
1445
1446 if (hsize > max_hsize) hsize = max_hsize;
1447
1448 for (c = 0; c < nchannel; c++) {
1449 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1450
1451 sl = sl_c + c;
1452 dl = dl_c + c;
1453
1454 #ifdef __SUNPRO_C
1455 #pragma pipeloop(0)
1456 #endif /* __SUNPRO_C */
1457 for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1458
1459 for (i = 0; i < wid; i++) {
1460 sl0 = sl;
1461
1462 for (off = 0; off < (n - 4); off += 4) {
1463 pk = k + off;
1464 sp = sl0;
1465
1466 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1467 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1468 sp += 3*sll;
1469
1470 #ifdef __SUNPRO_C
1471 #pragma pipeloop(0)
1472 #endif /* __SUNPRO_C */
1473 for (j = 0; j < hsize; j += 2) {
1474 p0 = p2; p1 = p3; p2 = p4;
1475 p3 = sp[0];
1476 p4 = sp[sll];
1477
1478 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1479 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1480
1481 sp += 2*sll;
1482 }
1483
1484 sl0 += 4*sll;
1485 }
1486
1487 pk = k + off;
1488 sp = sl0;
1489
1490 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1491 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1492
1493 dp = dl;
1494 kh = n - off;
1495
1496 if (kh == 4) {
1497 sp += 3*sll;
1498
1499 #ifdef __SUNPRO_C
1500 #pragma pipeloop(0)
1501 #endif /* __SUNPRO_C */
1502 for (j = 0; j <= (hsize - 2); j += 2) {
1503 p0 = p2; p1 = p3; p2 = p4;
1504 p3 = sp[0];
1505 p4 = sp[sll];
1506
1507 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1508 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1509
1510 dp[0 ] = FROM_S32(d0);
1511 dp[dll] = FROM_S32(d1);
1512
1513 pbuff[j] = 0;
1514 pbuff[j + 1] = 0;
1515
1516 sp += 2*sll;
1517 dp += 2*dll;
1518 }
1519
1520 if (j < hsize) {
1521 p0 = p2; p1 = p3; p2 = p4;
1522 p3 = sp[0];
1523
1524 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1525
1526 pbuff[j] = 0;
1527
1528 dp[0] = FROM_S32(d0);
1529 }
1530
1531 } else if (kh == 3) {
1532 sp += 2*sll;
1533
1534 #ifdef __SUNPRO_C
1535 #pragma pipeloop(0)
1536 #endif /* __SUNPRO_C */
1537 for (j = 0; j <= (hsize - 2); j += 2) {
1538 p0 = p2; p1 = p3;
1539 p2 = sp[0];
1540 p3 = sp[sll];
1541
1542 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1543 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1544
1545 dp[0 ] = FROM_S32(d0);
1546 dp[dll] = FROM_S32(d1);
1547
1548 pbuff[j] = 0;
1549 pbuff[j + 1] = 0;
1550
1551 sp += 2*sll;
1552 dp += 2*dll;
1553 }
1554
1555 if (j < hsize) {
1556 p0 = p2; p1 = p3;
1557 p2 = sp[0];
1558
1559 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1560
1561 pbuff[j] = 0;
1562
1563 dp[0] = FROM_S32(d0);
1564 }
1565
1566 } else if (kh == 2) {
1567 sp += sll;
1568
1569 #ifdef __SUNPRO_C
1570 #pragma pipeloop(0)
1571 #endif /* __SUNPRO_C */
1572 for (j = 0; j <= (hsize - 2); j += 2) {
1573 p0 = p2;
1574 p1 = sp[0];
1575 p2 = sp[sll];
1576
1577 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1578 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1579
1580 dp[0 ] = FROM_S32(d0);
1581 dp[dll] = FROM_S32(d1);
1582
1583 pbuff[j] = 0;
1584 pbuff[j + 1] = 0;
1585
1586 sp += 2*sll;
1587 dp += 2*dll;
1588 }
1589
1590 if (j < hsize) {
1591 p0 = p2;
1592 p1 = sp[0];
1593
1594 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1595
1596 pbuff[j] = 0;
1597
1598 dp[0] = FROM_S32(d0);
1599 }
1600
1601 } else /* if (kh == 1) */ {
1602 #ifdef __SUNPRO_C
1603 #pragma pipeloop(0)
1604 #endif /* __SUNPRO_C */
1605 for (j = 0; j < hsize; j++) {
1606 p0 = sp[0];
1607
1608 d0 = D2I(p0*k0 + pbuff[j]);
1609
1610 dp[0] = FROM_S32(d0);
1611
1612 pbuff[j] = 0;
1613
1614 sp += sll;
1615 dp += dll;
1616 }
1617 }
1618
1619 sl += chan1;
1620 dl += chan1;
1621 }
1622 }
1623
1624 sl_c += max_hsize*sll;
1625 dl_c += max_hsize*dll;
1626 }
1627
1628 if (pbuff != buff) mlib_free(pbuff);
1629
1630 return MLIB_SUCCESS;
1631 }
1632
1633 /***************************************************************/
CONV_FUNC(MxN)1634 mlib_status CONV_FUNC(MxN)(mlib_image *dst,
1635 const mlib_image *src,
1636 const mlib_s32 *kernel,
1637 mlib_s32 m,
1638 mlib_s32 n,
1639 mlib_s32 dm,
1640 mlib_s32 dn,
1641 mlib_s32 scale,
1642 mlib_s32 cmask)
1643 {
1644 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1645 FTYPE **buffs = buffs_arr, *buffd;
1646 FTYPE akernel[256], *k = akernel, fscale = DSCALE;
1647 mlib_s32 mn, l, off, kw, bsize, buff_ind;
1648 mlib_s32 d0, d1;
1649 FTYPE k0, k1, k2, k3, k4, k5, k6;
1650 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1651 d64_2x32 dd;
1652 DEF_VARS(DTYPE);
1653 mlib_s32 chan2;
1654 mlib_s32 *buffo, *buffi;
1655 mlib_status status = MLIB_SUCCESS;
1656
1657 GET_SRC_DST_PARAMETERS(DTYPE);
1658
1659 if (scale > 30) {
1660 fscale *= 1.0/(1 << 30);
1661 scale -= 30;
1662 }
1663
1664 fscale /= (1 << scale);
1665
1666 mn = m*n;
1667
1668 if (mn > 256) {
1669 k = mlib_malloc(mn*sizeof(mlib_d64));
1670
1671 if (k == NULL) return MLIB_FAILURE;
1672 }
1673
1674 for (i = 0; i < mn; i++) {
1675 k[i] = kernel[i]*fscale;
1676 }
1677
1678 if (m == 1) {
1679 status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1680 FREE_AND_RETURN_STATUS;
1681 }
1682
1683 bsize = (n + 3)*wid;
1684
1685 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1686 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1687
1688 if (pbuff == NULL) {
1689 status = MLIB_FAILURE;
1690 FREE_AND_RETURN_STATUS;
1691 }
1692 buffs = (FTYPE **)(pbuff + bsize);
1693 }
1694
1695 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1696 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1697 buffd = buffs[n] + wid;
1698 buffo = (mlib_s32*)(buffd + wid);
1699 buffi = buffo + (wid &~ 1);
1700
1701 chan1 = nchannel;
1702 chan2 = chan1 + chan1;
1703
1704 wid -= (m - 1);
1705 hgt -= (n - 1);
1706 adr_dst += dn*dll + dm*nchannel;
1707
1708 for (c = 0; c < nchannel; c++) {
1709 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1710
1711 sl = adr_src + c;
1712 dl = adr_dst + c;
1713
1714 for (l = 0; l < n; l++) {
1715 FTYPE *buff = buffs[l];
1716
1717 #ifdef __SUNPRO_C
1718 #pragma pipeloop(0)
1719 #endif /* __SUNPRO_C */
1720 for (i = 0; i < wid + (m - 1); i++) {
1721 buff[i] = (FTYPE)sl[i*chan1];
1722 }
1723
1724 sl += sll;
1725 }
1726
1727 buff_ind = 0;
1728
1729 #ifdef __SUNPRO_C
1730 #pragma pipeloop(0)
1731 #endif /* __SUNPRO_C */
1732 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1733
1734 for (j = 0; j < hgt; j++) {
1735 FTYPE **buffc = buffs + buff_ind;
1736 FTYPE *buffn = buffc[n];
1737 FTYPE *pk = k;
1738
1739 for (l = 0; l < n; l++) {
1740 FTYPE *buff_l = buffc[l];
1741
1742 for (off = 0; off < m;) {
1743 FTYPE *buff = buff_l + off;
1744
1745 kw = m - off;
1746
1747 if (kw > 2*MAX_KER) kw = MAX_KER; else
1748 if (kw > MAX_KER) kw = kw/2;
1749 off += kw;
1750
1751 sp = sl;
1752 dp = dl;
1753
1754 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1755 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1756
1757 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1758 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1759 pk += kw;
1760
1761 if (kw == 7) {
1762
1763 if (l < (n - 1) || off < m) {
1764 #ifdef __SUNPRO_C
1765 #pragma pipeloop(0)
1766 #endif /* __SUNPRO_C */
1767 for (i = 0; i <= (wid - 2); i += 2) {
1768 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1769
1770 p6 = buff[i + 6]; p7 = buff[i + 7];
1771
1772 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1773 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1774 }
1775
1776 } else {
1777 #ifdef __SUNPRO_C
1778 #pragma pipeloop(0)
1779 #endif /* __SUNPRO_C */
1780 for (i = 0; i <= (wid - 2); i += 2) {
1781 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1782
1783 p6 = buff[i + 6]; p7 = buff[i + 7];
1784
1785 LOAD_BUFF(buffi);
1786
1787 dd.d64 = *(FTYPE *)(buffi + i);
1788 buffn[i ] = (FTYPE)dd.i32s.i0;
1789 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1790
1791 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1792 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1793
1794 dp[0 ] = FROM_S32(d0);
1795 dp[chan1] = FROM_S32(d1);
1796
1797 buffd[i ] = 0.0;
1798 buffd[i + 1] = 0.0;
1799
1800 sp += chan2;
1801 dp += chan2;
1802 }
1803 }
1804
1805 } else if (kw == 6) {
1806
1807 if (l < (n - 1) || off < m) {
1808 #ifdef __SUNPRO_C
1809 #pragma pipeloop(0)
1810 #endif /* __SUNPRO_C */
1811 for (i = 0; i <= (wid - 2); i += 2) {
1812 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1813
1814 p5 = buff[i + 5]; p6 = buff[i + 6];
1815
1816 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1817 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1818 }
1819
1820 } else {
1821 #ifdef __SUNPRO_C
1822 #pragma pipeloop(0)
1823 #endif /* __SUNPRO_C */
1824 for (i = 0; i <= (wid - 2); i += 2) {
1825 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1826
1827 p5 = buff[i + 5]; p6 = buff[i + 6];
1828
1829 buffn[i ] = (FTYPE)sp[0];
1830 buffn[i + 1] = (FTYPE)sp[chan1];
1831
1832 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
1833 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1834
1835 dp[0 ] = FROM_S32(d0);
1836 dp[chan1] = FROM_S32(d1);
1837
1838 buffd[i ] = 0.0;
1839 buffd[i + 1] = 0.0;
1840
1841 sp += chan2;
1842 dp += chan2;
1843 }
1844 }
1845
1846 } else if (kw == 5) {
1847
1848 if (l < (n - 1) || off < m) {
1849 #ifdef __SUNPRO_C
1850 #pragma pipeloop(0)
1851 #endif /* __SUNPRO_C */
1852 for (i = 0; i <= (wid - 2); i += 2) {
1853 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1854
1855 p4 = buff[i + 4]; p5 = buff[i + 5];
1856
1857 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1858 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1859 }
1860
1861 } else {
1862 #ifdef __SUNPRO_C
1863 #pragma pipeloop(0)
1864 #endif /* __SUNPRO_C */
1865 for (i = 0; i <= (wid - 2); i += 2) {
1866 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1867
1868 p4 = buff[i + 4]; p5 = buff[i + 5];
1869
1870 buffn[i ] = (FTYPE)sp[0];
1871 buffn[i + 1] = (FTYPE)sp[chan1];
1872
1873 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
1874 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1875
1876 dp[0 ] = FROM_S32(d0);
1877 dp[chan1] = FROM_S32(d1);
1878
1879 buffd[i ] = 0.0;
1880 buffd[i + 1] = 0.0;
1881
1882 sp += chan2;
1883 dp += chan2;
1884 }
1885 }
1886
1887 } else if (kw == 4) {
1888
1889 if (l < (n - 1) || off < m) {
1890 #ifdef __SUNPRO_C
1891 #pragma pipeloop(0)
1892 #endif /* __SUNPRO_C */
1893 for (i = 0; i <= (wid - 2); i += 2) {
1894 p0 = p2; p1 = p3; p2 = p4;
1895
1896 p3 = buff[i + 3]; p4 = buff[i + 4];
1897
1898 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1899 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1900 }
1901
1902 } else {
1903 #ifdef __SUNPRO_C
1904 #pragma pipeloop(0)
1905 #endif /* __SUNPRO_C */
1906 for (i = 0; i <= (wid - 2); i += 2) {
1907 p0 = p2; p1 = p3; p2 = p4;
1908
1909 p3 = buff[i + 3]; p4 = buff[i + 4];
1910
1911 buffn[i ] = (FTYPE)sp[0];
1912 buffn[i + 1] = (FTYPE)sp[chan1];
1913
1914 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
1915 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1916
1917 dp[0 ] = FROM_S32(d0);
1918 dp[chan1] = FROM_S32(d1);
1919
1920 buffd[i ] = 0.0;
1921 buffd[i + 1] = 0.0;
1922
1923 sp += chan2;
1924 dp += chan2;
1925 }
1926 }
1927
1928 } else if (kw == 3) {
1929
1930 if (l < (n - 1) || off < m) {
1931 #ifdef __SUNPRO_C
1932 #pragma pipeloop(0)
1933 #endif /* __SUNPRO_C */
1934 for (i = 0; i <= (wid - 2); i += 2) {
1935 p0 = p2; p1 = p3;
1936
1937 p2 = buff[i + 2]; p3 = buff[i + 3];
1938
1939 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
1940 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1941 }
1942
1943 } else {
1944 #ifdef __SUNPRO_C
1945 #pragma pipeloop(0)
1946 #endif /* __SUNPRO_C */
1947 for (i = 0; i <= (wid - 2); i += 2) {
1948 p0 = p2; p1 = p3;
1949
1950 p2 = buff[i + 2]; p3 = buff[i + 3];
1951
1952 buffn[i ] = (FTYPE)sp[0];
1953 buffn[i + 1] = (FTYPE)sp[chan1];
1954
1955 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
1956 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1957
1958 dp[0 ] = FROM_S32(d0);
1959 dp[chan1] = FROM_S32(d1);
1960
1961 buffd[i ] = 0.0;
1962 buffd[i + 1] = 0.0;
1963
1964 sp += chan2;
1965 dp += chan2;
1966 }
1967 }
1968
1969 } else /*if (kw == 2)*/ {
1970
1971 if (l < (n - 1) || off < m) {
1972 #ifdef __SUNPRO_C
1973 #pragma pipeloop(0)
1974 #endif /* __SUNPRO_C */
1975 for (i = 0; i <= (wid - 2); i += 2) {
1976 p0 = p2;
1977
1978 p1 = buff[i + 1]; p2 = buff[i + 2];
1979
1980 buffd[i ] += p0*k0 + p1*k1;
1981 buffd[i + 1] += p1*k0 + p2*k1;
1982 }
1983
1984 } else {
1985 #ifdef __SUNPRO_C
1986 #pragma pipeloop(0)
1987 #endif /* __SUNPRO_C */
1988 for (i = 0; i <= (wid - 2); i += 2) {
1989 p0 = p2;
1990
1991 p1 = buff[i + 1]; p2 = buff[i + 2];
1992
1993 buffn[i ] = (FTYPE)sp[0];
1994 buffn[i + 1] = (FTYPE)sp[chan1];
1995
1996 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
1997 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1998
1999 dp[0 ] = FROM_S32(d0);
2000 dp[chan1] = FROM_S32(d1);
2001
2002 buffd[i ] = 0.0;
2003 buffd[i + 1] = 0.0;
2004
2005 sp += chan2;
2006 dp += chan2;
2007 }
2008 }
2009 }
2010 }
2011 }
2012
2013 /* last pixels */
2014 for (; i < wid; i++) {
2015 FTYPE *pk = k, s = 0;
2016 mlib_s32 x, d0;
2017
2018 for (l = 0; l < n; l++) {
2019 FTYPE *buff = buffc[l] + i;
2020
2021 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2022 }
2023
2024 d0 = D2I(s);
2025 dp[0] = FROM_S32(d0);
2026
2027 buffn[i] = (FTYPE)sp[0];
2028
2029 sp += chan1;
2030 dp += chan1;
2031 }
2032
2033 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2034
2035 /* next line */
2036 sl += sll;
2037 dl += dll;
2038
2039 buff_ind++;
2040
2041 if (buff_ind >= n + 1) buff_ind = 0;
2042 }
2043 }
2044
2045 FREE_AND_RETURN_STATUS;
2046 }
2047
2048 /***************************************************************/
2049 #ifndef __sparc /* for x86, using integer multiplies is faster */
2050
/*
 * Shift the accumulated sum x right by shift2 (a variable of the
 * invoking function) and store it into res through CLAMP_STORE.
 * x is modified.  Wrapped in do { } while (0) so that the two steps
 * stay together when the macro is used as a single statement.
 */
#define STORE_RES(res, x)                                       \
  do {                                                          \
    x >>= shift2;                                               \
    CLAMP_STORE(res, x);                                        \
  } while (0)
2054
CONV_FUNC_I(MxN)2055 mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,
2056 const mlib_image *src,
2057 const mlib_s32 *kernel,
2058 mlib_s32 m,
2059 mlib_s32 n,
2060 mlib_s32 dm,
2061 mlib_s32 dn,
2062 mlib_s32 scale,
2063 mlib_s32 cmask)
2064 {
2065 mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2066 mlib_s32 l, off, kw;
2067 mlib_s32 d0, d1, shift1, shift2;
2068 mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2069 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2070 DTYPE *adr_src, *sl, *sp = NULL;
2071 DTYPE *adr_dst, *dl, *dp = NULL;
2072 mlib_s32 wid, hgt, sll, dll;
2073 mlib_s32 nchannel, chan1;
2074 mlib_s32 i, j, c;
2075 mlib_s32 chan2;
2076 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2077 GET_SRC_DST_PARAMETERS(DTYPE);
2078
2079 #if IMG_TYPE != 1
2080 shift1 = 16;
2081 #else
2082 shift1 = 8;
2083 #endif /* IMG_TYPE != 1 */
2084 shift2 = scale - shift1;
2085
2086 chan1 = nchannel;
2087 chan2 = chan1 + chan1;
2088
2089 wid -= (m - 1);
2090 hgt -= (n - 1);
2091 adr_dst += dn*dll + dm*nchannel;
2092
2093 if (wid > BUFF_SIZE) {
2094 buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2095
2096 if (buffd == NULL) return MLIB_FAILURE;
2097 }
2098
2099 if (m*n > MAX_N*MAX_N) {
2100 k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2101
2102 if (k == NULL) {
2103 if (buffd != buff) mlib_free(buffd);
2104 return MLIB_FAILURE;
2105 }
2106 }
2107
2108 for (i = 0; i < m*n; i++) {
2109 k[i] = kernel[i] >> shift1;
2110 }
2111
2112 for (c = 0; c < nchannel; c++) {
2113 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2114
2115 sl = adr_src + c;
2116 dl = adr_dst + c;
2117
2118 #ifdef __SUNPRO_C
2119 #pragma pipeloop(0)
2120 #endif /* __SUNPRO_C */
2121 for (i = 0; i < wid; i++) buffd[i] = 0;
2122
2123 for (j = 0; j < hgt; j++) {
2124 mlib_s32 *pk = k;
2125
2126 for (l = 0; l < n; l++) {
2127 DTYPE *sp0 = sl + l*sll;
2128
2129 for (off = 0; off < m;) {
2130 sp = sp0 + off*chan1;
2131 dp = dl;
2132
2133 kw = m - off;
2134
2135 if (kw > 2*MAX_KER) kw = MAX_KER; else
2136 if (kw > MAX_KER) kw = kw/2;
2137 off += kw;
2138
2139 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2140 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2141
2142 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2143 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2144 pk += kw;
2145
2146 sp += (kw - 1)*chan1;
2147
2148 if (kw == 7) {
2149
2150 if (l < (n - 1) || off < m) {
2151 #ifdef __SUNPRO_C
2152 #pragma pipeloop(0)
2153 #endif /* __SUNPRO_C */
2154 for (i = 0; i <= (wid - 2); i += 2) {
2155 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2156 p6 = sp[0];
2157 p7 = sp[chan1];
2158
2159 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2160 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2161
2162 sp += chan2;
2163 }
2164
2165 } else {
2166 #ifdef __SUNPRO_C
2167 #pragma pipeloop(0)
2168 #endif /* __SUNPRO_C */
2169 for (i = 0; i <= (wid - 2); i += 2) {
2170 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2171 p6 = sp[0];
2172 p7 = sp[chan1];
2173
2174 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2175 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2176
2177 STORE_RES(dp[0 ], d0);
2178 STORE_RES(dp[chan1], d1);
2179
2180 buffd[i ] = 0;
2181 buffd[i + 1] = 0;
2182
2183 sp += chan2;
2184 dp += chan2;
2185 }
2186 }
2187
2188 } else if (kw == 6) {
2189
2190 if (l < (n - 1) || off < m) {
2191 #ifdef __SUNPRO_C
2192 #pragma pipeloop(0)
2193 #endif /* __SUNPRO_C */
2194 for (i = 0; i <= (wid - 2); i += 2) {
2195 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2196 p5 = sp[0];
2197 p6 = sp[chan1];
2198
2199 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2200 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2201
2202 sp += chan2;
2203 }
2204
2205 } else {
2206 #ifdef __SUNPRO_C
2207 #pragma pipeloop(0)
2208 #endif /* __SUNPRO_C */
2209 for (i = 0; i <= (wid - 2); i += 2) {
2210 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2211 p5 = sp[0];
2212 p6 = sp[chan1];
2213
2214 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2215 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2216
2217 STORE_RES(dp[0 ], d0);
2218 STORE_RES(dp[chan1], d1);
2219
2220 buffd[i ] = 0;
2221 buffd[i + 1] = 0;
2222
2223 sp += chan2;
2224 dp += chan2;
2225 }
2226 }
2227
2228 } else if (kw == 5) {
2229
2230 if (l < (n - 1) || off < m) {
2231 #ifdef __SUNPRO_C
2232 #pragma pipeloop(0)
2233 #endif /* __SUNPRO_C */
2234 for (i = 0; i <= (wid - 2); i += 2) {
2235 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2236 p4 = sp[0];
2237 p5 = sp[chan1];
2238
2239 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2240 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2241
2242 sp += chan2;
2243 }
2244
2245 } else {
2246 #ifdef __SUNPRO_C
2247 #pragma pipeloop(0)
2248 #endif /* __SUNPRO_C */
2249 for (i = 0; i <= (wid - 2); i += 2) {
2250 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2251 p4 = sp[0];
2252 p5 = sp[chan1];
2253
2254 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2255 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2256
2257 STORE_RES(dp[0 ], d0);
2258 STORE_RES(dp[chan1], d1);
2259
2260 buffd[i ] = 0;
2261 buffd[i + 1] = 0;
2262
2263 sp += chan2;
2264 dp += chan2;
2265 }
2266 }
2267
2268 } else if (kw == 4) {
2269
2270 if (l < (n - 1) || off < m) {
2271 #ifdef __SUNPRO_C
2272 #pragma pipeloop(0)
2273 #endif /* __SUNPRO_C */
2274 for (i = 0; i <= (wid - 2); i += 2) {
2275 p0 = p2; p1 = p3; p2 = p4;
2276 p3 = sp[0];
2277 p4 = sp[chan1];
2278
2279 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2280 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2281
2282 sp += chan2;
2283 }
2284
2285 } else {
2286 #ifdef __SUNPRO_C
2287 #pragma pipeloop(0)
2288 #endif /* __SUNPRO_C */
2289 for (i = 0; i <= (wid - 2); i += 2) {
2290 p0 = p2; p1 = p3; p2 = p4;
2291 p3 = sp[0];
2292 p4 = sp[chan1];
2293
2294 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2295 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2296
2297 STORE_RES(dp[0 ], d0);
2298 STORE_RES(dp[chan1], d1);
2299
2300 buffd[i ] = 0;
2301 buffd[i + 1] = 0;
2302
2303 sp += chan2;
2304 dp += chan2;
2305 }
2306 }
2307
2308 } else if (kw == 3) {
2309
2310 if (l < (n - 1) || off < m) {
2311 #ifdef __SUNPRO_C
2312 #pragma pipeloop(0)
2313 #endif /* __SUNPRO_C */
2314 for (i = 0; i <= (wid - 2); i += 2) {
2315 p0 = p2; p1 = p3;
2316 p2 = sp[0];
2317 p3 = sp[chan1];
2318
2319 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2320 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2321
2322 sp += chan2;
2323 }
2324
2325 } else {
2326 #ifdef __SUNPRO_C
2327 #pragma pipeloop(0)
2328 #endif /* __SUNPRO_C */
2329 for (i = 0; i <= (wid - 2); i += 2) {
2330 p0 = p2; p1 = p3;
2331 p2 = sp[0];
2332 p3 = sp[chan1];
2333
2334 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2335 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2336
2337 STORE_RES(dp[0 ], d0);
2338 STORE_RES(dp[chan1], d1);
2339
2340 buffd[i ] = 0;
2341 buffd[i + 1] = 0;
2342
2343 sp += chan2;
2344 dp += chan2;
2345 }
2346 }
2347
2348 } else if (kw == 2) {
2349
2350 if (l < (n - 1) || off < m) {
2351 #ifdef __SUNPRO_C
2352 #pragma pipeloop(0)
2353 #endif /* __SUNPRO_C */
2354 for (i = 0; i <= (wid - 2); i += 2) {
2355 p0 = p2;
2356 p1 = sp[0];
2357 p2 = sp[chan1];
2358
2359 buffd[i ] += p0*k0 + p1*k1;
2360 buffd[i + 1] += p1*k0 + p2*k1;
2361
2362 sp += chan2;
2363 }
2364
2365 } else {
2366 #ifdef __SUNPRO_C
2367 #pragma pipeloop(0)
2368 #endif /* __SUNPRO_C */
2369 for (i = 0; i <= (wid - 2); i += 2) {
2370 p0 = p2;
2371 p1 = sp[0];
2372 p2 = sp[chan1];
2373
2374 d0 = (p0*k0 + p1*k1 + buffd[i ]);
2375 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2376
2377 STORE_RES(dp[0 ], d0);
2378 STORE_RES(dp[chan1], d1);
2379
2380 buffd[i ] = 0;
2381 buffd[i + 1] = 0;
2382
2383 sp += chan2;
2384 dp += chan2;
2385 }
2386 }
2387
2388 } else /*if (kw == 1)*/ {
2389
2390 if (l < (n - 1) || off < m) {
2391 #ifdef __SUNPRO_C
2392 #pragma pipeloop(0)
2393 #endif /* __SUNPRO_C */
2394 for (i = 0; i <= (wid - 2); i += 2) {
2395 p0 = sp[0];
2396 p1 = sp[chan1];
2397
2398 buffd[i ] += p0*k0;
2399 buffd[i + 1] += p1*k0;
2400
2401 sp += chan2;
2402 }
2403
2404 } else {
2405 #ifdef __SUNPRO_C
2406 #pragma pipeloop(0)
2407 #endif /* __SUNPRO_C */
2408 for (i = 0; i <= (wid - 2); i += 2) {
2409 p0 = sp[0];
2410 p1 = sp[chan1];
2411
2412 d0 = (p0*k0 + buffd[i ]);
2413 d1 = (p1*k0 + buffd[i + 1]);
2414
2415 STORE_RES(dp[0 ], d0);
2416 STORE_RES(dp[chan1], d1);
2417
2418 buffd[i ] = 0;
2419 buffd[i + 1] = 0;
2420
2421 sp += chan2;
2422 dp += chan2;
2423 }
2424 }
2425 }
2426 }
2427 }
2428
2429 /* last pixels */
2430 for (; i < wid; i++) {
2431 mlib_s32 *pk = k, s = 0;
2432 mlib_s32 x;
2433
2434 for (l = 0; l < n; l++) {
2435 sp = sl + l*sll + i*chan1;
2436
2437 for (x = 0; x < m; x++) {
2438 s += sp[0] * pk[0];
2439 sp += chan1;
2440 pk ++;
2441 }
2442 }
2443
2444 STORE_RES(dp[0], s);
2445
2446 sp += chan1;
2447 dp += chan1;
2448 }
2449
2450 sl += sll;
2451 dl += dll;
2452 }
2453 }
2454
2455 if (buffd != buff) mlib_free(buffd);
2456 if (k != k_locl) mlib_free(k);
2457
2458 return MLIB_SUCCESS;
2459 }
2460
2461 /***************************************************************/
2462 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
2463
2464 /***************************************************************/
2465