1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27 /*
28 * FUNCTION
29 * Internal functions for mlib_ImageConv* on U8/S16/U16 types and
30 * MLIB_EDGE_DST_NO_WRITE mask
31 */
32
33 #include "mlib_image.h"
34 #include "mlib_c_ImageConv.h"
35
/*
 * IMG_TYPE selects the pixel type this file is compiled for:
 *   1 - mlib_u8,  2 - mlib_s16,  3 - mlib_u16
 *
 * Every variant provides the same set of macros:
 *   DTYPE        pixel type of source and destination
 *   CONV_FUNC    name of the floating-point convolution entry point
 *   CONV_FUNC_I  name of the integer convolution entry point
 *   DSCALE       starting value of the kernel scale factor
 *   FROM_S32     turns the 32-bit value produced by D2I into a pixel
 *   S64TOS32     applied to the low word before LOAD_BUFF packs two
 *                pixels into one 64-bit store
 *   SAT_OFF      text pasted right after the operand of D2I; for the
 *                unsigned types it subtracts 2^31 (FROM_S32 flips the
 *                top bit back), for s16 it is empty
 */
#define IMG_TYPE 3

/***************************************************************/
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#else

/* Fail early instead of producing a cascade of undefined-macro errors. */
#error "IMG_TYPE must be 1 (mlib_u8), 2 (mlib_s16) or 3 (mlib_u16)"

#endif /* IMG_TYPE == 1 */
73
74 /***************************************************************/
#define BUFF_SIZE   1600

#define CACHE_SIZE  (64*1024)

/***************************************************************/
/* Floating-point type in which the convolution sums are accumulated. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x) converts x to mlib_s32.  When MLIB_USE_FTOI_CLAMPING is
 * defined the plain cast is used; otherwise values at or beyond the
 * mlib_s32 limits are pinned to MLIB_S32_MAX / MLIB_S32_MIN before the
 * cast.  The argument may be evaluated more than once.
 */
#ifdef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x) ((mlib_s32)(x))

#else

#define CLAMP_S32(x)                                        \
  (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX :                   \
   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (mlib_s32)(x)))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * D2I(x): apply the type-specific SAT_OFF text to x (an offset, or
 * nothing) and convert the result with CLAMP_S32.
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
95
96 /***************************************************************/
/*
 * STORE2(res0, res1) writes two results to dp[0] and dp[chan1]; `dp` and
 * `chan1` are taken from the scope of the caller.  On little-endian
 * builds (VM_LITTLE_ENDIAN) the two results are swapped.
 *
 * The expansion is wrapped in do { } while (0) so the macro behaves as a
 * single statement (safe under an unbraced if/else), and each argument is
 * parenthesized and evaluated exactly once.
 */
#ifdef VM_LITTLE_ENDIAN

#define STORE2(res0, res1)  \
  do {                      \
    dp[0    ] = (res1);     \
    dp[chan1] = (res0);     \
  } while (0)

#else

#define STORE2(res0, res1)  \
  do {                      \
    dp[0    ] = (res0);     \
    dp[chan1] = (res1);     \
  } while (0)

#endif /* VM_LITTLE_ENDIAN */
110
111 /***************************************************************/
/*
 * LOAD_BUFF(buff) copies the two source pixels sp[0] and sp[chan1] into
 * the 32-bit slots buff[i] and buff[i + 1]; `sp`, `chan1` and `i` come
 * from the scope of the caller.
 *
 * Without 64-bit integers (_NO_LONGLONG) two separate stores are used.
 * Otherwise both pixels are packed into a single mlib_s64 store, with the
 * word order chosen per endianness so that sp[0] ends up in buff[i].
 * The 64-bit store requires (buff + i) to be suitably aligned for
 * mlib_s64.
 *
 * All variants are wrapped in do { } while (0) so the macro is a single
 * statement regardless of which variant is compiled.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)            \
  do {                             \
    (buff)[i    ] = sp[0];         \
    (buff)[i + 1] = sp[chan1];     \
  } while (0)

#else /* _NO_LONGLONG */

#ifdef VM_LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                                  \
  do {                                                                   \
    *(mlib_s64*)((buff) + i) =                                           \
      (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0]);         \
  } while (0)

#else /* VM_LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                                  \
  do {                                                                   \
    *(mlib_s64*)((buff) + i) =                                           \
      (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1]);         \
  } while (0)

#endif /* VM_LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
132
133 /***************************************************************/
134 typedef union {
135 mlib_d64 d64;
136 struct {
137 mlib_s32 i0;
138 mlib_s32 i1;
139 } i32s;
140 struct {
141 mlib_s32 f0;
142 mlib_s32 f1;
143 } f32s;
144 } d64_2x32;
145
146 /***************************************************************/
/* Number of FTYPE elements per line in the on-stack work buffers. */
#define BUFF_LINE 256

/***************************************************************/
/*
 * DEF_VARS(type) declares the locals shared by the floating-point
 * convolution functions.  It expects an array named `buff` to be declared
 * already (pbuff starts out pointing at it) and leaves the final
 * declaration unterminated so the caller supplies the semicolon.
 */
#define DEF_VARS(type)                        \
  mlib_s32 i, j, c;                           \
  mlib_s32 nchannel, chan1;                   \
  mlib_s32 wid, hgt, sll, dll;                \
  FTYPE    *pbuff = buff;                     \
  type     *adr_dst, *dl, *dp = NULL;         \
  type     *adr_src, *sl, *sp = NULL
157
158 /***************************************************************/
/*
 * LOAD_KERNEL3() declares the nine scaled coefficients k0..k8 and the
 * pixel temporaries p00..p23 of the 3x3 code, then fills k0..k8 with
 * kern[n] * DSCALE / 2^scalef_expon.  The division is applied in steps
 * of at most 2^30 so that every (1 << n) stays within a 32-bit int.
 * Note that the caller's scalef_expon is modified.
 */
#define LOAD_KERNEL3()                                                   \
  FTYPE scalef = DSCALE;                                                 \
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8;                              \
  FTYPE p00, p01, p02, p03,                                              \
        p10, p11, p12, p13,                                              \
        p20, p21, p22, p23;                                              \
                                                                         \
  for (; scalef_expon > 30; scalef_expon -= 30) {                        \
    scalef /= (1 << 30);                                                 \
  }                                                                      \
  scalef /= (1 << scalef_expon);                                         \
                                                                         \
  /* keep kernel in regs */                                              \
  k0 = scalef * kern[0];                                                 \
  k1 = scalef * kern[1];                                                 \
  k2 = scalef * kern[2];                                                 \
  k3 = scalef * kern[3];                                                 \
  k4 = scalef * kern[4];                                                 \
  k5 = scalef * kern[5];                                                 \
  k6 = scalef * kern[6];                                                 \
  k7 = scalef * kern[7];                                                 \
  k8 = scalef * kern[8]
177
178 /***************************************************************/
/*
 * LOAD_KERNEL(SIZE) fills the caller's array k[0 .. SIZE-1] with
 * kern[n] * DSCALE / 2^scalef_expon.  The division is applied in steps
 * of at most 2^30 so that every (1 << n) stays within a 32-bit int.
 * Uses the caller's `j` as loop index and modifies scalef_expon.
 */
#define LOAD_KERNEL(SIZE)                                                \
  FTYPE scalef = DSCALE;                                                 \
                                                                         \
  for (; scalef_expon > 30; scalef_expon -= 30) {                        \
    scalef /= (1 << 30);                                                 \
  }                                                                      \
  scalef /= (1 << scalef_expon);                                         \
                                                                         \
  for (j = 0; j < (SIZE); j++) {                                         \
    k[j] = scalef * kern[j];                                             \
  }
190
191 /***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type) reads the geometry of `src` and `dst`
 * into the caller's locals.  Width, height and channel count are taken
 * from the source image; the strides returned by mlib_ImageGetStride are
 * divided by sizeof(type), so sll and dll count elements of `type`.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  adr_src  = (type *)mlib_ImageGetData(src);                    \
  adr_dst  = (type *)mlib_ImageGetData(dst);                    \
  sll      = mlib_ImageGetStride(src) / sizeof(type);           \
  dll      = mlib_ImageGetStride(dst) / sizeof(type);           \
  wid      = mlib_ImageGetWidth(src);                           \
  hgt      = mlib_ImageGetHeight(src);                          \
  nchannel = mlib_ImageGetChannels(src)
200
201 /***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val) stores the integer `val` into the pixel `dst`,
 * saturating it to the range of the pixel type selected by IMG_TYPE.
 *
 * Every variant is wrapped in do { } while (0) so that it is a single
 * statement (safe under an unbraced if/else), and the arguments are
 * parenthesized.  `val` may be evaluated more than once.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)          \
  do {                                 \
    if ((val) & 0xffffff00) {          \
      if ((val) < MLIB_U8_MIN)         \
        (dst) = MLIB_U8_MIN;           \
      else                             \
        (dst) = MLIB_U8_MAX;           \
    } else {                           \
      (dst) = (mlib_u8)(val);          \
    }                                  \
  } while (0)

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)          \
  do {                                 \
    if ((val) >= MLIB_S16_MAX)         \
      (dst) = MLIB_S16_MAX;            \
    else if ((val) <= MLIB_S16_MIN)    \
      (dst) = MLIB_S16_MIN;            \
    else                               \
      (dst) = (mlib_s16)(val);         \
  } while (0)

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)          \
  do {                                 \
    if ((val) >= MLIB_U16_MAX)         \
      (dst) = MLIB_U16_MAX;            \
    else if ((val) <= MLIB_U16_MIN)    \
      (dst) = MLIB_U16_MIN;            \
    else                               \
      (dst) = (mlib_u16)(val);         \
  } while (0)

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
247
248 /***************************************************************/
249 #define KSIZE 3
250
251 mlib_status CONV_FUNC(3x3)(mlib_image *dst,
252 const mlib_image *src,
253 const mlib_s32 *kern,
254 mlib_s32 scalef_expon,
255 mlib_s32 cmask)
256 {
257 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
258 DEF_VARS(DTYPE);
259 DTYPE *sl1;
260 mlib_s32 chan2;
261 mlib_s32 *buffo, *buffi;
262 DTYPE *sl2;
263 #ifndef __sparc
264 mlib_s32 d0, d1;
265 #endif /* __sparc */
266 LOAD_KERNEL3();
267 GET_SRC_DST_PARAMETERS(DTYPE);
268
269 if (wid > BUFF_LINE) {
270 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
271
272 if (pbuff == NULL) return MLIB_FAILURE;
273 }
274
275 buff0 = pbuff;
276 buff1 = buff0 + wid;
277 buff2 = buff1 + wid;
278 buff3 = buff2 + wid;
279 buffo = (mlib_s32*)(buff3 + wid);
280 buffi = buffo + (wid &~ 1);
281
282 chan1 = nchannel;
283 chan2 = chan1 + chan1;
284
285 wid -= (KSIZE - 1);
286 hgt -= (KSIZE - 1);
287
288 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
289
290 for (c = 0; c < nchannel; c++) {
291 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
292
293 sl = adr_src + c;
294 dl = adr_dst + c;
295
296 sl1 = sl + sll;
297 sl2 = sl1 + sll;
298 #ifdef __SUNPRO_C
299 #pragma pipeloop(0)
300 #endif /* __SUNPRO_C */
301 for (i = 0; i < wid + (KSIZE - 1); i++) {
302 buff0[i] = (FTYPE)sl[i*chan1];
303 buff1[i] = (FTYPE)sl1[i*chan1];
304 buff2[i] = (FTYPE)sl2[i*chan1];
305 }
306
307 sl += KSIZE*sll;
308
309 for (j = 0; j < hgt; j++) {
310 FTYPE s0, s1;
311
312 p02 = buff0[0];
313 p12 = buff1[0];
314 p22 = buff2[0];
315
316 p03 = buff0[1];
317 p13 = buff1[1];
318 p23 = buff2[1];
319
320 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
321 s1 = p03 * k0 + p13 * k3 + p23 * k6;
322
323 sp = sl;
324 dp = dl;
325
326 #ifdef __SUNPRO_C
327 #pragma pipeloop(0)
328 #endif /* __SUNPRO_C */
329 for (i = 0; i <= (wid - 2); i += 2) {
330 #ifdef __sparc
331 #ifdef _NO_LONGLONG
332 mlib_s32 o64_1, o64_2;
333 #else /* _NO_LONGLONG */
334 mlib_s64 o64;
335 #endif /* _NO_LONGLONG */
336 #endif /* __sparc */
337 d64_2x32 dd;
338
339 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
340 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
341
342 LOAD_BUFF(buffi);
343
344 dd.d64 = *(FTYPE *)(buffi + i);
345 buff3[i ] = (FTYPE)dd.i32s.i0;
346 buff3[i + 1] = (FTYPE)dd.i32s.i1;
347
348 #ifndef __sparc
349 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
350 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
351
352 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
353 s1 = p03 * k0 + p13 * k3 + p23 * k6;
354
355 dp[0 ] = FROM_S32(d0);
356 dp[chan1] = FROM_S32(d1);
357
358 #else /* __sparc */
359
360 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
361 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
362 *(FTYPE *)(buffo + i) = dd.d64;
363
364 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
365 s1 = p03 * k0 + p13 * k3 + p23 * k6;
366
367 #ifdef _NO_LONGLONG
368
369 o64_1 = buffo[i];
370 o64_2 = buffo[i+1];
371 #if IMG_TYPE != 1
372 STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
373 #else
374 STORE2(o64_1 >> 24, o64_2 >> 24);
375 #endif /* IMG_TYPE != 1 */
376
377 #else /* _NO_LONGLONG */
378
379 o64 = *(mlib_s64*)(buffo + i);
380 #if IMG_TYPE != 1
381 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
382 #else
383 STORE2(o64 >> 56, o64 >> 24);
384 #endif /* IMG_TYPE != 1 */
385 #endif /* _NO_LONGLONG */
386 #endif /* __sparc */
387
388 sp += chan2;
389 dp += chan2;
390 }
391
392 for (; i < wid; i++) {
393 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
394 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
395 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
396
397 buffi[i] = (mlib_s32)sp[0];
398 buff3[i] = (FTYPE)buffi[i];
399
400 #ifndef __sparc
401
402 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
403 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
404
405 dp[0] = FROM_S32(d0);
406
407 #else /* __sparc */
408
409 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
410 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
411 #if IMG_TYPE != 1
412 dp[0] = FROM_S32(buffo[i]);
413 #else
414 dp[0] = buffo[i] >> 24;
415 #endif /* IMG_TYPE != 1 */
416 #endif /* __sparc */
417
418 sp += chan1;
419 dp += chan1;
420 }
421
422 buffi[wid] = (mlib_s32)sp[0];
423 buff3[wid] = (FTYPE)buffi[wid];
424 buffi[wid + 1] = (mlib_s32)sp[chan1];
425 buff3[wid + 1] = (FTYPE)buffi[wid + 1];
426
427 sl += sll;
428 dl += dll;
429
430 buffT = buff0;
431 buff0 = buff1;
432 buff1 = buff2;
433 buff2 = buff3;
434 buff3 = buffT;
435 }
436 }
437
438 #ifdef __sparc
439 #if IMG_TYPE == 1
440 {
441 mlib_s32 amask = (1 << nchannel) - 1;
442
443 if ((cmask & amask) != amask) {
444 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
445 } else {
446 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
447 }
448 }
449
450 #endif /* IMG_TYPE == 1 */
451 #endif /* __sparc */
452
453 if (pbuff != buff) mlib_free(pbuff);
454
455 return MLIB_SUCCESS;
456 }
457
458 /***************************************************************/
459 #ifndef __sparc /* for x86, using integer multiplies is faster */
460
461 mlib_status CONV_FUNC_I(3x3)(mlib_image *dst,
462 const mlib_image *src,
463 const mlib_s32 *kern,
464 mlib_s32 scalef_expon,
465 mlib_s32 cmask)
466 {
467 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2;
468 DTYPE *adr_dst, *dl, *dp;
469 mlib_s32 wid, hgt, sll, dll;
470 mlib_s32 nchannel, chan1, chan2;
471 mlib_s32 i, j, c;
472 mlib_s32 shift1, shift2;
473 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
474 mlib_s32 p02, p03,
475 p12, p13,
476 p22, p23;
477
478 #if IMG_TYPE != 1
479 shift1 = 16;
480 #else
481 shift1 = 8;
482 #endif /* IMG_TYPE != 1 */
483
484 shift2 = scalef_expon - shift1;
485
486 /* keep kernel in regs */
487 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
488 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
489 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;
490
491 GET_SRC_DST_PARAMETERS(DTYPE);
492
493 chan1 = nchannel;
494 chan2 = chan1 + chan1;
495
496 wid -= (KSIZE - 1);
497 hgt -= (KSIZE - 1);
498
499 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
500
501 for (c = 0; c < chan1; c++) {
502 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
503
504 sl = adr_src + c;
505 dl = adr_dst + c;
506
507 for (j = 0; j < hgt; j++) {
508 mlib_s32 s0, s1;
509 mlib_s32 pix0, pix1;
510
511 dp = dl;
512 sp0 = sl;
513 sp1 = sp0 + sll;
514 sp2 = sp1 + sll;
515
516 p02 = sp0[0];
517 p12 = sp1[0];
518 p22 = sp2[0];
519
520 p03 = sp0[chan1];
521 p13 = sp1[chan1];
522 p23 = sp2[chan1];
523
524 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
525 s1 = p03 * k0 + p13 * k3 + p23 * k6;
526
527 sp0 += chan2;
528 sp1 += chan2;
529 sp2 += chan2;
530
531 #ifdef __SUNPRO_C
532 #pragma pipeloop(0)
533 #endif /* __SUNPRO_C */
534 for (i = 0; i <= (wid - 2); i += 2) {
535 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
536 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
537
538 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
539 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
540 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
541
542 CLAMP_STORE(dp[0], pix0);
543 CLAMP_STORE(dp[chan1], pix1);
544
545 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
546 s1 = p03 * k0 + p13 * k3 + p23 * k6;
547
548 sp0 += chan2;
549 sp1 += chan2;
550 sp2 += chan2;
551 dp += chan2;
552 }
553
554 if (wid & 1) {
555 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
556 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
557 CLAMP_STORE(dp[0], pix0);
558 }
559
560 sl += sll;
561 dl += dll;
562 }
563 }
564
565 return MLIB_SUCCESS;
566 }
567
568 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
569
570 /***************************************************************/
571 #undef KSIZE
572 #define KSIZE 4
573
574 mlib_status CONV_FUNC(4x4)(mlib_image *dst,
575 const mlib_image *src,
576 const mlib_s32 *kern,
577 mlib_s32 scalef_expon,
578 mlib_s32 cmask)
579 {
580 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
581 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
582 FTYPE k[KSIZE*KSIZE];
583 mlib_s32 d0, d1;
584 FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
585 FTYPE p00, p01, p02, p03, p04,
586 p10, p11, p12, p13, p14,
587 p20, p21, p22, p23,
588 p30, p31, p32, p33;
589 DEF_VARS(DTYPE);
590 DTYPE *sl1;
591 mlib_s32 chan2;
592 mlib_s32 *buffo, *buffi;
593 DTYPE *sl2, *sl3;
594 LOAD_KERNEL(KSIZE*KSIZE);
595 GET_SRC_DST_PARAMETERS(DTYPE);
596
597 if (wid > BUFF_LINE) {
598 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
599
600 if (pbuff == NULL) return MLIB_FAILURE;
601 }
602
603 buff0 = pbuff;
604 buff1 = buff0 + wid;
605 buff2 = buff1 + wid;
606 buff3 = buff2 + wid;
607 buff4 = buff3 + wid;
608 buffd = buff4 + wid;
609 buffo = (mlib_s32*)(buffd + wid);
610 buffi = buffo + (wid &~ 1);
611
612 chan1 = nchannel;
613 chan2 = chan1 + chan1;
614
615 wid -= (KSIZE - 1);
616 hgt -= (KSIZE - 1);
617
618 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
619
620 for (c = 0; c < nchannel; c++) {
621 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
622
623 sl = adr_src + c;
624 dl = adr_dst + c;
625
626 sl1 = sl + sll;
627 sl2 = sl1 + sll;
628 sl3 = sl2 + sll;
629 #ifdef __SUNPRO_C
630 #pragma pipeloop(0)
631 #endif /* __SUNPRO_C */
632 for (i = 0; i < wid + (KSIZE - 1); i++) {
633 buff0[i] = (FTYPE)sl[i*chan1];
634 buff1[i] = (FTYPE)sl1[i*chan1];
635 buff2[i] = (FTYPE)sl2[i*chan1];
636 buff3[i] = (FTYPE)sl3[i*chan1];
637 }
638
639 sl += KSIZE*sll;
640
641 for (j = 0; j < hgt; j++) {
642 d64_2x32 dd;
643
644 /*
645 * First loop on two first lines of kernel
646 */
647 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
648 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
649
650 sp = sl;
651 dp = dl;
652
653 p02 = buff0[0];
654 p12 = buff1[0];
655 p03 = buff0[1];
656 p13 = buff1[1];
657 p04 = buff0[2];
658
659 #ifdef __SUNPRO_C
660 #pragma pipeloop(0)
661 #endif /* __SUNPRO_C */
662 for (i = 0; i <= (wid - 2); i += 2) {
663 p00 = p02; p10 = p12;
664 p01 = p03; p11 = p13;
665 p02 = p04; p12 = buff1[i + 2];
666 p03 = buff0[i + 3]; p13 = buff1[i + 3];
667 p04 = buff0[i + 4]; p14 = buff1[i + 4];
668
669 LOAD_BUFF(buffi);
670
671 dd.d64 = *(FTYPE *)(buffi + i);
672 buff4[i ] = (FTYPE)dd.i32s.i0;
673 buff4[i + 1] = (FTYPE)dd.i32s.i1;
674
675 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
676 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
677 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
678 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
679
680 sp += chan2;
681 dp += chan2;
682 }
683
684 /*
685 * Second loop on two last lines of kernel
686 */
687 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
688 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
689
690 sp = sl;
691 dp = dl;
692
693 p02 = buff2[0];
694 p12 = buff3[0];
695 p03 = buff2[1];
696 p13 = buff3[1];
697 p04 = buff2[2];
698
699 #ifdef __SUNPRO_C
700 #pragma pipeloop(0)
701 #endif /* __SUNPRO_C */
702 for (i = 0; i <= (wid - 2); i += 2) {
703 p00 = p02; p10 = p12;
704 p01 = p03; p11 = p13;
705 p02 = p04; p12 = buff3[i + 2];
706 p03 = buff2[i + 3]; p13 = buff3[i + 3];
707 p04 = buff2[i + 4]; p14 = buff3[i + 4];
708
709 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
710 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
711 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
712 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
713
714 dp[0 ] = FROM_S32(d0);
715 dp[chan1] = FROM_S32(d1);
716
717 sp += chan2;
718 dp += chan2;
719 }
720
721 /* last pixels */
722 for (; i < wid; i++) {
723 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
724 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
725 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
726 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
727
728 buff4[i] = (FTYPE)sp[0];
729
730 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
731 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
732 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
733 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
734
735 dp[0] = FROM_S32(buffo[i]);
736
737 sp += chan1;
738 dp += chan1;
739 }
740
741 buff4[wid ] = (FTYPE)sp[0];
742 buff4[wid + 1] = (FTYPE)sp[chan1];
743 buff4[wid + 2] = (FTYPE)sp[chan2];
744
745 /* next line */
746 sl += sll;
747 dl += dll;
748
749 buffT = buff0;
750 buff0 = buff1;
751 buff1 = buff2;
752 buff2 = buff3;
753 buff3 = buff4;
754 buff4 = buffT;
755 }
756 }
757
758 if (pbuff != buff) mlib_free(pbuff);
759
760 return MLIB_SUCCESS;
761 }
762
763 /***************************************************************/
764 #undef KSIZE
765 #define KSIZE 5
766
767 mlib_status CONV_FUNC(5x5)(mlib_image *dst,
768 const mlib_image *src,
769 const mlib_s32 *kern,
770 mlib_s32 scalef_expon,
771 mlib_s32 cmask)
772 {
773 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
774 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
775 FTYPE k[KSIZE*KSIZE];
776 mlib_s32 d0, d1;
777 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
778 FTYPE p00, p01, p02, p03, p04, p05,
779 p10, p11, p12, p13, p14, p15,
780 p20, p21, p22, p23, p24,
781 p30, p31, p32, p33, p34,
782 p40, p41, p42, p43, p44;
783 DEF_VARS(DTYPE);
784 DTYPE *sl1;
785 mlib_s32 chan2;
786 mlib_s32 *buffo, *buffi;
787 DTYPE *sl2, *sl3, *sl4;
788 LOAD_KERNEL(KSIZE*KSIZE);
789 GET_SRC_DST_PARAMETERS(DTYPE);
790
791 if (wid > BUFF_LINE) {
792 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
793
794 if (pbuff == NULL) return MLIB_FAILURE;
795 }
796
797 buff0 = pbuff;
798 buff1 = buff0 + wid;
799 buff2 = buff1 + wid;
800 buff3 = buff2 + wid;
801 buff4 = buff3 + wid;
802 buff5 = buff4 + wid;
803 buffd = buff5 + wid;
804 buffo = (mlib_s32*)(buffd + wid);
805 buffi = buffo + (wid &~ 1);
806
807 chan1 = nchannel;
808 chan2 = chan1 + chan1;
809
810 wid -= (KSIZE - 1);
811 hgt -= (KSIZE - 1);
812
813 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
814
815 for (c = 0; c < nchannel; c++) {
816 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
817
818 sl = adr_src + c;
819 dl = adr_dst + c;
820
821 sl1 = sl + sll;
822 sl2 = sl1 + sll;
823 sl3 = sl2 + sll;
824 sl4 = sl3 + sll;
825 #ifdef __SUNPRO_C
826 #pragma pipeloop(0)
827 #endif /* __SUNPRO_C */
828 for (i = 0; i < wid + (KSIZE - 1); i++) {
829 buff0[i] = (FTYPE)sl[i*chan1];
830 buff1[i] = (FTYPE)sl1[i*chan1];
831 buff2[i] = (FTYPE)sl2[i*chan1];
832 buff3[i] = (FTYPE)sl3[i*chan1];
833 buff4[i] = (FTYPE)sl4[i*chan1];
834 }
835
836 sl += KSIZE*sll;
837
838 for (j = 0; j < hgt; j++) {
839 d64_2x32 dd;
840
841 /*
842 * First loop
843 */
844 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
845 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
846
847 sp = sl;
848 dp = dl;
849
850 p02 = buff0[0];
851 p12 = buff1[0];
852 p03 = buff0[1];
853 p13 = buff1[1];
854 p04 = buff0[2];
855 p14 = buff1[2];
856
857 #ifdef __SUNPRO_C
858 #pragma pipeloop(0)
859 #endif /* __SUNPRO_C */
860 for (i = 0; i <= (wid - 2); i += 2) {
861 p00 = p02; p10 = p12;
862 p01 = p03; p11 = p13;
863 p02 = p04; p12 = p14;
864
865 LOAD_BUFF(buffi);
866
867 p03 = buff0[i + 3]; p13 = buff1[i + 3];
868 p04 = buff0[i + 4]; p14 = buff1[i + 4];
869 p05 = buff0[i + 5]; p15 = buff1[i + 5];
870
871 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
872 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
873 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
874 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
875
876 sp += chan2;
877 dp += chan2;
878 }
879
880 /*
881 * Second loop
882 */
883 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
884 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
885
886 sp = sl;
887 dp = dl;
888
889 p02 = buff2[0];
890 p12 = buff3[0];
891 p03 = buff2[1];
892 p13 = buff3[1];
893 p04 = buff2[2];
894 p14 = buff3[2];
895
896 #ifdef __SUNPRO_C
897 #pragma pipeloop(0)
898 #endif /* __SUNPRO_C */
899 for (i = 0; i <= (wid - 2); i += 2) {
900 p00 = p02; p10 = p12;
901 p01 = p03; p11 = p13;
902
903 p02 = buff2[i + 2]; p12 = buff3[i + 2];
904 p03 = buff2[i + 3]; p13 = buff3[i + 3];
905 p04 = buff2[i + 4]; p14 = buff3[i + 4];
906 p05 = buff2[i + 5]; p15 = buff3[i + 5];
907
908 dd.d64 = *(FTYPE *)(buffi + i);
909 buff5[i ] = (FTYPE)dd.i32s.i0;
910 buff5[i + 1] = (FTYPE)dd.i32s.i1;
911
912 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
913 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
914 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
915 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
916
917 sp += chan2;
918 dp += chan2;
919 }
920
921 /*
922 * 3 loop
923 */
924 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
925
926 sp = sl;
927 dp = dl;
928
929 p02 = buff4[0];
930 p03 = buff4[1];
931 p04 = buff4[2];
932 p05 = buff4[3];
933
934 #ifdef __SUNPRO_C
935 #pragma pipeloop(0)
936 #endif /* __SUNPRO_C */
937 for (i = 0; i <= (wid - 2); i += 2) {
938 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
939
940 p04 = buff4[i + 4]; p05 = buff4[i + 5];
941
942 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
943 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
944
945 dp[0 ] = FROM_S32(d0);
946 dp[chan1] = FROM_S32(d1);
947
948 sp += chan2;
949 dp += chan2;
950 }
951
952 /* last pixels */
953 for (; i < wid; i++) {
954 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
955 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
956 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
957 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
958 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
959
960 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
961 p43 = buff4[i + 3]; p44 = buff4[i + 4];
962
963 buff5[i] = (FTYPE)sp[0];
964
965 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
966 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
967 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
968 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
969 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
970
971 dp[0] = FROM_S32(buffo[i]);
972
973 sp += chan1;
974 dp += chan1;
975 }
976
977 buff5[wid ] = (FTYPE)sp[0];
978 buff5[wid + 1] = (FTYPE)sp[chan1];
979 buff5[wid + 2] = (FTYPE)sp[chan2];
980 buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
981
982 /* next line */
983 sl += sll;
984 dl += dll;
985
986 buffT = buff0;
987 buff0 = buff1;
988 buff1 = buff2;
989 buff2 = buff3;
990 buff3 = buff4;
991 buff4 = buff5;
992 buff5 = buffT;
993 }
994 }
995
996 if (pbuff != buff) mlib_free(pbuff);
997
998 return MLIB_SUCCESS;
999 }
1000
1001 /***************************************************************/
1002 #ifndef __sparc /* for x86, using integer multiplies is faster */
1003
1004 mlib_status CONV_FUNC_I(5x5)(mlib_image *dst,
1005 const mlib_image *src,
1006 const mlib_s32 *kern,
1007 mlib_s32 scalef_expon,
1008 mlib_s32 cmask)
1009 {
1010 mlib_s32 buff[BUFF_LINE];
1011 mlib_s32 *buffd;
1012 mlib_s32 k[KSIZE*KSIZE];
1013 mlib_s32 shift1, shift2;
1014 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1015 mlib_s32 p00, p01, p02, p03, p04, p05,
1016 p10, p11, p12, p13, p14, p15;
1017 DTYPE *adr_src, *sl, *sp0, *sp1;
1018 DTYPE *adr_dst, *dl, *dp;
1019 mlib_s32 *pbuff = buff;
1020 mlib_s32 wid, hgt, sll, dll;
1021 mlib_s32 nchannel, chan1, chan2, chan3, chan4;
1022 mlib_s32 i, j, c;
1023
1024 #if IMG_TYPE != 1
1025 shift1 = 16;
1026 #else
1027 shift1 = 8;
1028 #endif /* IMG_TYPE != 1 */
1029
1030 shift2 = scalef_expon - shift1;
1031
1032 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1033
1034 GET_SRC_DST_PARAMETERS(DTYPE);
1035
1036 if (wid > BUFF_LINE) {
1037 pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1038
1039 if (pbuff == NULL) return MLIB_FAILURE;
1040 }
1041
1042 buffd = pbuff;
1043
1044 chan1 = nchannel;
1045 chan2 = chan1 + chan1;
1046 chan3 = chan2 + chan1;
1047 chan4 = chan3 + chan1;
1048
1049 wid -= (KSIZE - 1);
1050 hgt -= (KSIZE - 1);
1051
1052 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1053
1054 for (c = 0; c < chan1; c++) {
1055 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1056
1057 sl = adr_src + c;
1058 dl = adr_dst + c;
1059
1060 for (j = 0; j < hgt; j++) {
1061 mlib_s32 pix0, pix1;
1062 /*
1063 * First loop
1064 */
1065 sp0 = sl;
1066 sp1 = sp0 + sll;
1067 dp = dl;
1068
1069 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1070 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1071
1072 p02 = sp0[0]; p12 = sp1[0];
1073 p03 = sp0[chan1]; p13 = sp1[chan1];
1074 p04 = sp0[chan2]; p14 = sp1[chan2];
1075 p05 = sp0[chan3]; p15 = sp1[chan3];
1076
1077 sp0 += chan4;
1078 sp1 += chan4;
1079
1080 #ifdef __SUNPRO_C
1081 #pragma pipeloop(0)
1082 #endif /* __SUNPRO_C */
1083 for (i = 0; i <= (wid - 2); i += 2) {
1084 p00 = p02; p10 = p12;
1085 p01 = p03; p11 = p13;
1086 p02 = p04; p12 = p14;
1087 p03 = p05; p13 = p15;
1088
1089 p04 = sp0[0]; p14 = sp1[0];
1090 p05 = sp0[chan1]; p15 = sp1[chan1];
1091
1092 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1093 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1094 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1095 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1096
1097 sp0 += chan2;
1098 sp1 += chan2;
1099 dp += chan2;
1100 }
1101
1102 if (wid & 1) {
1103 p00 = p02; p10 = p12;
1104 p01 = p03; p11 = p13;
1105 p02 = p04; p12 = p14;
1106 p03 = p05; p13 = p15;
1107
1108 p04 = sp0[0]; p14 = sp1[0];
1109
1110 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1111 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1112 }
1113
1114 /*
1115 * Second loop
1116 */
1117 sp0 = sl + 2*sll;
1118 sp1 = sp0 + sll;
1119 dp = dl;
1120
1121 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1122 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1123
1124 p02 = sp0[0]; p12 = sp1[0];
1125 p03 = sp0[chan1]; p13 = sp1[chan1];
1126 p04 = sp0[chan2]; p14 = sp1[chan2];
1127 p05 = sp0[chan3]; p15 = sp1[chan3];
1128
1129 sp0 += chan4;
1130 sp1 += chan4;
1131
1132 #ifdef __SUNPRO_C
1133 #pragma pipeloop(0)
1134 #endif /* __SUNPRO_C */
1135 for (i = 0; i <= (wid - 2); i += 2) {
1136 p00 = p02; p10 = p12;
1137 p01 = p03; p11 = p13;
1138 p02 = p04; p12 = p14;
1139 p03 = p05; p13 = p15;
1140
1141 p04 = sp0[0]; p14 = sp1[0];
1142 p05 = sp0[chan1]; p15 = sp1[chan1];
1143
1144 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1145 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1146 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1147 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1148
1149 sp0 += chan2;
1150 sp1 += chan2;
1151 dp += chan2;
1152 }
1153
1154 if (wid & 1) {
1155 p00 = p02; p10 = p12;
1156 p01 = p03; p11 = p13;
1157 p02 = p04; p12 = p14;
1158 p03 = p05; p13 = p15;
1159
1160 p04 = sp0[0]; p14 = sp1[0];
1161
1162 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1163 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1164 }
1165
1166 /*
1167 * 3 loop
1168 */
1169 dp = dl;
1170 sp0 = sl + 4*sll;
1171
1172 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1173
1174 p02 = sp0[0];
1175 p03 = sp0[chan1];
1176 p04 = sp0[chan2];
1177 p05 = sp0[chan3];
1178
1179 sp0 += chan2 + chan2;
1180
1181 #ifdef __SUNPRO_C
1182 #pragma pipeloop(0)
1183 #endif /* __SUNPRO_C */
1184 for (i = 0; i <= (wid - 2); i += 2) {
1185 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1186
1187 p04 = sp0[0]; p05 = sp0[chan1];
1188
1189 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1190 p03 * k3 + p04 * k4) >> shift2;
1191 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1192 p04 * k3 + p05 * k4) >> shift2;
1193
1194 CLAMP_STORE(dp[0], pix0);
1195 CLAMP_STORE(dp[chan1], pix1);
1196
1197 dp += chan2;
1198 sp0 += chan2;
1199 }
1200
1201 if (wid & 1) {
1202 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1203
1204 p04 = sp0[0];
1205
1206 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1207 p03 * k3 + p04 * k4) >> shift2;
1208 CLAMP_STORE(dp[0], pix0);
1209 }
1210
1211 /* next line */
1212 sl += sll;
1213 dl += dll;
1214 }
1215 }
1216
1217 if (pbuff != buff) mlib_free(pbuff);
1218
1219 return MLIB_SUCCESS;
1220 }
1221
1222 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1223
1224 /***************************************************************/
1225 #if IMG_TYPE == 1
1226
1227 #undef KSIZE
1228 #define KSIZE 7
1229
/*
 * 7x7 convolution, compiled only when IMG_TYPE == 1, with all arithmetic
 * done in FTYPE.
 *
 * The processed area is shrunk by KSIZE - 1 in both dimensions and the
 * destination origin is moved by (KSIZE - 1)/2 lines and pixels, so the
 * border of the destination is not written by this routine.
 *
 *   dst, src      destination / source images (unpacked by
 *                 GET_SRC_DST_PARAMETERS, defined outside this section)
 *   kern          kernel coefficients; presumably read by LOAD_KERNEL,
 *                 which is defined outside this section - TODO confirm
 *   scalef_expon  kernel scale exponent; presumably read by LOAD_KERNEL
 *   cmask         channel mask: channel c is processed only when bit
 *                 (nchannel - 1 - c) is set
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot be
 * allocated.
 */
mlib_status CONV_FUNC(7x7)(mlib_image *dst,
                           const mlib_image *src,
                           const mlib_s32 *kern,
                           mlib_s32 scalef_expon,
                           mlib_s32 cmask)
{
  FTYPE buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
  FTYPE k[KSIZE*KSIZE];
  mlib_s32 l, m, buff_ind;
  mlib_s32 d0, d1;
  FTYPE k0, k1, k2, k3, k4, k5, k6;
  FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
  DEF_VARS(DTYPE);
  DTYPE *sl1;
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* The on-stack buffer covers lines up to BUFF_LINE pixels wide;
     wider images need KSIZE + 3 lines of heap scratch instead. */
  if (wid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* Scratch layout, each line wid elements long:
     lines 0..KSIZE   source-row ring (KSIZE live rows + the row being loaded)
     next line        buffd, the per-pixel accumulator
     last line        buffo / buffi, viewed as mlib_s32
     The pointer table is stored twice so that buffs + buff_ind always gives
     KSIZE + 1 consecutive entries without wrapping. */
  for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
  buffd = buffs[KSIZE] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* From here on wid/hgt are the dimensions of the written area */
  wid -= (KSIZE - 1);
  hgt -= (KSIZE - 1);

  adr_dst += ((KSIZE - 1)/2)*(dll + chan1);

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    sl1 = sl + sll;
    sl2 = sl1 + sll;
    sl3 = sl2 + sll;
    sl4 = sl3 + sll;
    sl5 = sl4 + sll;
    sl6 = sl5 + sll;
    /* Prime the ring with the first KSIZE source rows of this channel */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid + (KSIZE - 1); i++) {
      buffs[0][i] = (FTYPE)sl[i*chan1];
      buffs[1][i] = (FTYPE)sl1[i*chan1];
      buffs[2][i] = (FTYPE)sl2[i*chan1];
      buffs[3][i] = (FTYPE)sl3[i*chan1];
      buffs[4][i] = (FTYPE)sl4[i*chan1];
      buffs[5][i] = (FTYPE)sl5[i*chan1];
      buffs[6][i] = (FTYPE)sl6[i*chan1];
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    /* sl now points at the next source row that has not been buffered yet */
    sl += KSIZE*sll;

    for (j = 0; j < hgt; j++) {
      FTYPE **buffc = buffs + buff_ind;  /* current window of KSIZE rows */
      FTYPE *buffn = buffc[KSIZE];       /* ring slot receiving the next row */
      FTYPE *pk = k;

      for (l = 0; l < KSIZE; l++) {
        FTYPE *buff = buffc[l];          /* shadows the scratch array above */
        d64_2x32 dd;

        sp = sl;
        dp = dl;

        /* Preload the sliding window; the loops below shift it by two
           pixels per iteration. */
        p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
        p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

        k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
        k4 = *pk++; k5 = *pk++; k6 = *pk++;

        if (l < (KSIZE - 1)) {
          /* Kernel rows 0..KSIZE-2: only accumulate into buffd */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
            buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
          }

        } else {
          /* Last kernel row: finish the sum, store two output pixels, clear
             the accumulator and fill buffn with the next source row.
             NOTE(review): LOAD_BUFF is defined outside this section; it
             presumably stages the source pixels at sp into buffi - confirm. */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            LOAD_BUFF(buffi);

            dd.d64 = *(FTYPE *)(buffi + i);
            buffn[i ] = (FTYPE)dd.i32s.i0;
            buffn[i + 1] = (FTYPE)dd.i32s.i1;

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

            dp[0 ] = FROM_S32(d0);
            dp[chan1] = FROM_S32(d1);

            buffd[i ] = 0.0;
            buffd[i + 1] = 0.0;

            sp += chan2;
            dp += chan2;
          }
        }
      }

      /* last pixels: i, sp and dp carry over from the final pass above, so
         this handles the one remaining pixel of an odd-width line by summing
         the full kernel directly (buffd is not used for it). */
      for (; i < wid; i++) {
        FTYPE *pk = k, s = 0;
        mlib_s32 d0;

        for (l = 0; l < KSIZE; l++) {
          FTYPE *buff = buffc[l] + i;

          for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* Copy the KSIZE - 1 trailing source pixels of the next row */
      for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      /* Rotate the ring by one row */
      buff_ind++;

      if (buff_ind >= KSIZE + 1) buff_ind = 0;
    }
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
1401
1402 #endif /* IMG_TYPE == 1 */
1403
1404 /***************************************************************/
/* Widest horizontal kernel strip the MxN routines process in one pass;
   wider kernel rows are split into strips of at most this many taps. */
#define MAX_KER 7
/* Sizing constant for the on-stack tables of the MxN routines (row-pointer
   table in the FTYPE version, kernel copy in the integer version). */
#define MAX_N 15
1407
mlib_ImageConv1xN(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dn,mlib_s32 cmask)1408 static mlib_status mlib_ImageConv1xN(mlib_image *dst,
1409 const mlib_image *src,
1410 const mlib_d64 *k,
1411 mlib_s32 n,
1412 mlib_s32 dn,
1413 mlib_s32 cmask)
1414 {
1415 FTYPE buff[BUFF_SIZE];
1416 mlib_s32 off, kh;
1417 mlib_s32 d0, d1;
1418 const FTYPE *pk;
1419 FTYPE k0, k1, k2, k3;
1420 FTYPE p0, p1, p2, p3, p4;
1421 DEF_VARS(DTYPE);
1422 DTYPE *sl_c, *dl_c, *sl0;
1423 mlib_s32 l, hsize, max_hsize;
1424 GET_SRC_DST_PARAMETERS(DTYPE);
1425
1426 hgt -= (n - 1);
1427 adr_dst += dn*dll;
1428
1429 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1430
1431 if (!max_hsize) max_hsize = 1;
1432
1433 if (max_hsize > BUFF_SIZE) {
1434 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1435 }
1436
1437 chan1 = nchannel;
1438
1439 sl_c = adr_src;
1440 dl_c = adr_dst;
1441
1442 for (l = 0; l < hgt; l += hsize) {
1443 hsize = hgt - l;
1444
1445 if (hsize > max_hsize) hsize = max_hsize;
1446
1447 for (c = 0; c < nchannel; c++) {
1448 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1449
1450 sl = sl_c + c;
1451 dl = dl_c + c;
1452
1453 #ifdef __SUNPRO_C
1454 #pragma pipeloop(0)
1455 #endif /* __SUNPRO_C */
1456 for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1457
1458 for (i = 0; i < wid; i++) {
1459 sl0 = sl;
1460
1461 for (off = 0; off < (n - 4); off += 4) {
1462 pk = k + off;
1463 sp = sl0;
1464
1465 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1466 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1467 sp += 3*sll;
1468
1469 #ifdef __SUNPRO_C
1470 #pragma pipeloop(0)
1471 #endif /* __SUNPRO_C */
1472 for (j = 0; j < hsize; j += 2) {
1473 p0 = p2; p1 = p3; p2 = p4;
1474 p3 = sp[0];
1475 p4 = sp[sll];
1476
1477 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1478 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1479
1480 sp += 2*sll;
1481 }
1482
1483 sl0 += 4*sll;
1484 }
1485
1486 pk = k + off;
1487 sp = sl0;
1488
1489 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1490 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1491
1492 dp = dl;
1493 kh = n - off;
1494
1495 if (kh == 4) {
1496 sp += 3*sll;
1497
1498 #ifdef __SUNPRO_C
1499 #pragma pipeloop(0)
1500 #endif /* __SUNPRO_C */
1501 for (j = 0; j <= (hsize - 2); j += 2) {
1502 p0 = p2; p1 = p3; p2 = p4;
1503 p3 = sp[0];
1504 p4 = sp[sll];
1505
1506 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1507 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1508
1509 dp[0 ] = FROM_S32(d0);
1510 dp[dll] = FROM_S32(d1);
1511
1512 pbuff[j] = 0;
1513 pbuff[j + 1] = 0;
1514
1515 sp += 2*sll;
1516 dp += 2*dll;
1517 }
1518
1519 if (j < hsize) {
1520 p0 = p2; p1 = p3; p2 = p4;
1521 p3 = sp[0];
1522
1523 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1524
1525 pbuff[j] = 0;
1526
1527 dp[0] = FROM_S32(d0);
1528 }
1529
1530 } else if (kh == 3) {
1531 sp += 2*sll;
1532
1533 #ifdef __SUNPRO_C
1534 #pragma pipeloop(0)
1535 #endif /* __SUNPRO_C */
1536 for (j = 0; j <= (hsize - 2); j += 2) {
1537 p0 = p2; p1 = p3;
1538 p2 = sp[0];
1539 p3 = sp[sll];
1540
1541 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1542 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1543
1544 dp[0 ] = FROM_S32(d0);
1545 dp[dll] = FROM_S32(d1);
1546
1547 pbuff[j] = 0;
1548 pbuff[j + 1] = 0;
1549
1550 sp += 2*sll;
1551 dp += 2*dll;
1552 }
1553
1554 if (j < hsize) {
1555 p0 = p2; p1 = p3;
1556 p2 = sp[0];
1557
1558 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1559
1560 pbuff[j] = 0;
1561
1562 dp[0] = FROM_S32(d0);
1563 }
1564
1565 } else if (kh == 2) {
1566 sp += sll;
1567
1568 #ifdef __SUNPRO_C
1569 #pragma pipeloop(0)
1570 #endif /* __SUNPRO_C */
1571 for (j = 0; j <= (hsize - 2); j += 2) {
1572 p0 = p2;
1573 p1 = sp[0];
1574 p2 = sp[sll];
1575
1576 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1577 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1578
1579 dp[0 ] = FROM_S32(d0);
1580 dp[dll] = FROM_S32(d1);
1581
1582 pbuff[j] = 0;
1583 pbuff[j + 1] = 0;
1584
1585 sp += 2*sll;
1586 dp += 2*dll;
1587 }
1588
1589 if (j < hsize) {
1590 p0 = p2;
1591 p1 = sp[0];
1592
1593 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1594
1595 pbuff[j] = 0;
1596
1597 dp[0] = FROM_S32(d0);
1598 }
1599
1600 } else /* if (kh == 1) */ {
1601 #ifdef __SUNPRO_C
1602 #pragma pipeloop(0)
1603 #endif /* __SUNPRO_C */
1604 for (j = 0; j < hsize; j++) {
1605 p0 = sp[0];
1606
1607 d0 = D2I(p0*k0 + pbuff[j]);
1608
1609 dp[0] = FROM_S32(d0);
1610
1611 pbuff[j] = 0;
1612
1613 sp += sll;
1614 dp += dll;
1615 }
1616 }
1617
1618 sl += chan1;
1619 dl += chan1;
1620 }
1621 }
1622
1623 sl_c += max_hsize*sll;
1624 dl_c += max_hsize*dll;
1625 }
1626
1627 if (pbuff != buff) mlib_free(pbuff);
1628
1629 return MLIB_SUCCESS;
1630 }
1631
1632 /***************************************************************/
CONV_FUNC(MxN)1633 mlib_status CONV_FUNC(MxN)(mlib_image *dst,
1634 const mlib_image *src,
1635 const mlib_s32 *kernel,
1636 mlib_s32 m,
1637 mlib_s32 n,
1638 mlib_s32 dm,
1639 mlib_s32 dn,
1640 mlib_s32 scale,
1641 mlib_s32 cmask)
1642 {
1643 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1644 FTYPE **buffs = buffs_arr, *buffd;
1645 FTYPE akernel[256], *k = akernel, fscale = DSCALE;
1646 mlib_s32 mn, l, off, kw, bsize, buff_ind;
1647 mlib_s32 d0, d1;
1648 FTYPE k0, k1, k2, k3, k4, k5, k6;
1649 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1650 d64_2x32 dd;
1651 DEF_VARS(DTYPE);
1652 mlib_s32 chan2;
1653 mlib_s32 *buffo, *buffi;
1654 mlib_status status = MLIB_SUCCESS;
1655
1656 GET_SRC_DST_PARAMETERS(DTYPE);
1657
1658 if (scale > 30) {
1659 fscale *= 1.0/(1 << 30);
1660 scale -= 30;
1661 }
1662
1663 fscale /= (1 << scale);
1664
1665 mn = m*n;
1666
1667 if (mn > 256) {
1668 k = mlib_malloc(mn*sizeof(mlib_d64));
1669
1670 if (k == NULL) return MLIB_FAILURE;
1671 }
1672
1673 for (i = 0; i < mn; i++) {
1674 k[i] = kernel[i]*fscale;
1675 }
1676
1677 if (m == 1) {
1678 status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1679 FREE_AND_RETURN_STATUS;
1680 }
1681
1682 bsize = (n + 3)*wid;
1683
1684 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1685 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1686
1687 if (pbuff == NULL) {
1688 status = MLIB_FAILURE;
1689 FREE_AND_RETURN_STATUS;
1690 }
1691 buffs = (FTYPE **)(pbuff + bsize);
1692 }
1693
1694 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1695 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1696 buffd = buffs[n] + wid;
1697 buffo = (mlib_s32*)(buffd + wid);
1698 buffi = buffo + (wid &~ 1);
1699
1700 chan1 = nchannel;
1701 chan2 = chan1 + chan1;
1702
1703 wid -= (m - 1);
1704 hgt -= (n - 1);
1705 adr_dst += dn*dll + dm*nchannel;
1706
1707 for (c = 0; c < nchannel; c++) {
1708 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1709
1710 sl = adr_src + c;
1711 dl = adr_dst + c;
1712
1713 for (l = 0; l < n; l++) {
1714 FTYPE *buff = buffs[l];
1715
1716 #ifdef __SUNPRO_C
1717 #pragma pipeloop(0)
1718 #endif /* __SUNPRO_C */
1719 for (i = 0; i < wid + (m - 1); i++) {
1720 buff[i] = (FTYPE)sl[i*chan1];
1721 }
1722
1723 sl += sll;
1724 }
1725
1726 buff_ind = 0;
1727
1728 #ifdef __SUNPRO_C
1729 #pragma pipeloop(0)
1730 #endif /* __SUNPRO_C */
1731 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1732
1733 for (j = 0; j < hgt; j++) {
1734 FTYPE **buffc = buffs + buff_ind;
1735 FTYPE *buffn = buffc[n];
1736 FTYPE *pk = k;
1737
1738 for (l = 0; l < n; l++) {
1739 FTYPE *buff_l = buffc[l];
1740
1741 for (off = 0; off < m;) {
1742 FTYPE *buff = buff_l + off;
1743
1744 kw = m - off;
1745
1746 if (kw > 2*MAX_KER) kw = MAX_KER; else
1747 if (kw > MAX_KER) kw = kw/2;
1748 off += kw;
1749
1750 sp = sl;
1751 dp = dl;
1752
1753 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1754 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1755
1756 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1757 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1758 pk += kw;
1759
1760 if (kw == 7) {
1761
1762 if (l < (n - 1) || off < m) {
1763 #ifdef __SUNPRO_C
1764 #pragma pipeloop(0)
1765 #endif /* __SUNPRO_C */
1766 for (i = 0; i <= (wid - 2); i += 2) {
1767 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1768
1769 p6 = buff[i + 6]; p7 = buff[i + 7];
1770
1771 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1772 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1773 }
1774
1775 } else {
1776 #ifdef __SUNPRO_C
1777 #pragma pipeloop(0)
1778 #endif /* __SUNPRO_C */
1779 for (i = 0; i <= (wid - 2); i += 2) {
1780 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1781
1782 p6 = buff[i + 6]; p7 = buff[i + 7];
1783
1784 LOAD_BUFF(buffi);
1785
1786 dd.d64 = *(FTYPE *)(buffi + i);
1787 buffn[i ] = (FTYPE)dd.i32s.i0;
1788 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1789
1790 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1791 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1792
1793 dp[0 ] = FROM_S32(d0);
1794 dp[chan1] = FROM_S32(d1);
1795
1796 buffd[i ] = 0.0;
1797 buffd[i + 1] = 0.0;
1798
1799 sp += chan2;
1800 dp += chan2;
1801 }
1802 }
1803
1804 } else if (kw == 6) {
1805
1806 if (l < (n - 1) || off < m) {
1807 #ifdef __SUNPRO_C
1808 #pragma pipeloop(0)
1809 #endif /* __SUNPRO_C */
1810 for (i = 0; i <= (wid - 2); i += 2) {
1811 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1812
1813 p5 = buff[i + 5]; p6 = buff[i + 6];
1814
1815 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1816 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1817 }
1818
1819 } else {
1820 #ifdef __SUNPRO_C
1821 #pragma pipeloop(0)
1822 #endif /* __SUNPRO_C */
1823 for (i = 0; i <= (wid - 2); i += 2) {
1824 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1825
1826 p5 = buff[i + 5]; p6 = buff[i + 6];
1827
1828 buffn[i ] = (FTYPE)sp[0];
1829 buffn[i + 1] = (FTYPE)sp[chan1];
1830
1831 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
1832 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1833
1834 dp[0 ] = FROM_S32(d0);
1835 dp[chan1] = FROM_S32(d1);
1836
1837 buffd[i ] = 0.0;
1838 buffd[i + 1] = 0.0;
1839
1840 sp += chan2;
1841 dp += chan2;
1842 }
1843 }
1844
1845 } else if (kw == 5) {
1846
1847 if (l < (n - 1) || off < m) {
1848 #ifdef __SUNPRO_C
1849 #pragma pipeloop(0)
1850 #endif /* __SUNPRO_C */
1851 for (i = 0; i <= (wid - 2); i += 2) {
1852 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1853
1854 p4 = buff[i + 4]; p5 = buff[i + 5];
1855
1856 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1857 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1858 }
1859
1860 } else {
1861 #ifdef __SUNPRO_C
1862 #pragma pipeloop(0)
1863 #endif /* __SUNPRO_C */
1864 for (i = 0; i <= (wid - 2); i += 2) {
1865 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1866
1867 p4 = buff[i + 4]; p5 = buff[i + 5];
1868
1869 buffn[i ] = (FTYPE)sp[0];
1870 buffn[i + 1] = (FTYPE)sp[chan1];
1871
1872 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
1873 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1874
1875 dp[0 ] = FROM_S32(d0);
1876 dp[chan1] = FROM_S32(d1);
1877
1878 buffd[i ] = 0.0;
1879 buffd[i + 1] = 0.0;
1880
1881 sp += chan2;
1882 dp += chan2;
1883 }
1884 }
1885
1886 } else if (kw == 4) {
1887
1888 if (l < (n - 1) || off < m) {
1889 #ifdef __SUNPRO_C
1890 #pragma pipeloop(0)
1891 #endif /* __SUNPRO_C */
1892 for (i = 0; i <= (wid - 2); i += 2) {
1893 p0 = p2; p1 = p3; p2 = p4;
1894
1895 p3 = buff[i + 3]; p4 = buff[i + 4];
1896
1897 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1898 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1899 }
1900
1901 } else {
1902 #ifdef __SUNPRO_C
1903 #pragma pipeloop(0)
1904 #endif /* __SUNPRO_C */
1905 for (i = 0; i <= (wid - 2); i += 2) {
1906 p0 = p2; p1 = p3; p2 = p4;
1907
1908 p3 = buff[i + 3]; p4 = buff[i + 4];
1909
1910 buffn[i ] = (FTYPE)sp[0];
1911 buffn[i + 1] = (FTYPE)sp[chan1];
1912
1913 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
1914 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1915
1916 dp[0 ] = FROM_S32(d0);
1917 dp[chan1] = FROM_S32(d1);
1918
1919 buffd[i ] = 0.0;
1920 buffd[i + 1] = 0.0;
1921
1922 sp += chan2;
1923 dp += chan2;
1924 }
1925 }
1926
1927 } else if (kw == 3) {
1928
1929 if (l < (n - 1) || off < m) {
1930 #ifdef __SUNPRO_C
1931 #pragma pipeloop(0)
1932 #endif /* __SUNPRO_C */
1933 for (i = 0; i <= (wid - 2); i += 2) {
1934 p0 = p2; p1 = p3;
1935
1936 p2 = buff[i + 2]; p3 = buff[i + 3];
1937
1938 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
1939 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1940 }
1941
1942 } else {
1943 #ifdef __SUNPRO_C
1944 #pragma pipeloop(0)
1945 #endif /* __SUNPRO_C */
1946 for (i = 0; i <= (wid - 2); i += 2) {
1947 p0 = p2; p1 = p3;
1948
1949 p2 = buff[i + 2]; p3 = buff[i + 3];
1950
1951 buffn[i ] = (FTYPE)sp[0];
1952 buffn[i + 1] = (FTYPE)sp[chan1];
1953
1954 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
1955 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1956
1957 dp[0 ] = FROM_S32(d0);
1958 dp[chan1] = FROM_S32(d1);
1959
1960 buffd[i ] = 0.0;
1961 buffd[i + 1] = 0.0;
1962
1963 sp += chan2;
1964 dp += chan2;
1965 }
1966 }
1967
1968 } else /*if (kw == 2)*/ {
1969
1970 if (l < (n - 1) || off < m) {
1971 #ifdef __SUNPRO_C
1972 #pragma pipeloop(0)
1973 #endif /* __SUNPRO_C */
1974 for (i = 0; i <= (wid - 2); i += 2) {
1975 p0 = p2;
1976
1977 p1 = buff[i + 1]; p2 = buff[i + 2];
1978
1979 buffd[i ] += p0*k0 + p1*k1;
1980 buffd[i + 1] += p1*k0 + p2*k1;
1981 }
1982
1983 } else {
1984 #ifdef __SUNPRO_C
1985 #pragma pipeloop(0)
1986 #endif /* __SUNPRO_C */
1987 for (i = 0; i <= (wid - 2); i += 2) {
1988 p0 = p2;
1989
1990 p1 = buff[i + 1]; p2 = buff[i + 2];
1991
1992 buffn[i ] = (FTYPE)sp[0];
1993 buffn[i + 1] = (FTYPE)sp[chan1];
1994
1995 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
1996 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1997
1998 dp[0 ] = FROM_S32(d0);
1999 dp[chan1] = FROM_S32(d1);
2000
2001 buffd[i ] = 0.0;
2002 buffd[i + 1] = 0.0;
2003
2004 sp += chan2;
2005 dp += chan2;
2006 }
2007 }
2008 }
2009 }
2010 }
2011
2012 /* last pixels */
2013 for (; i < wid; i++) {
2014 FTYPE *pk = k, s = 0;
2015 mlib_s32 x, d0;
2016
2017 for (l = 0; l < n; l++) {
2018 FTYPE *buff = buffc[l] + i;
2019
2020 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2021 }
2022
2023 d0 = D2I(s);
2024 dp[0] = FROM_S32(d0);
2025
2026 buffn[i] = (FTYPE)sp[0];
2027
2028 sp += chan1;
2029 dp += chan1;
2030 }
2031
2032 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2033
2034 /* next line */
2035 sl += sll;
2036 dl += dll;
2037
2038 buff_ind++;
2039
2040 if (buff_ind >= n + 1) buff_ind = 0;
2041 }
2042 }
2043
2044 FREE_AND_RETURN_STATUS;
2045 }
2046
2047 /***************************************************************/
2048 #ifndef __sparc /* for x86, using integer multiplies is faster */
2049
/*
 * Shift x right by shift2 (in place, so x is modified) and store it to res
 * through CLAMP_STORE. Expands to more than one statement and needs a
 * variable named shift2 in scope at the point of use.
 */
#define STORE_RES(res, x)                                       \
  x >>= shift2;                                                 \
  CLAMP_STORE(res, x)
2053
CONV_FUNC_I(MxN)2054 mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,
2055 const mlib_image *src,
2056 const mlib_s32 *kernel,
2057 mlib_s32 m,
2058 mlib_s32 n,
2059 mlib_s32 dm,
2060 mlib_s32 dn,
2061 mlib_s32 scale,
2062 mlib_s32 cmask)
2063 {
2064 mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2065 mlib_s32 l, off, kw;
2066 mlib_s32 d0, d1, shift1, shift2;
2067 mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2068 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2069 DTYPE *adr_src, *sl, *sp = NULL;
2070 DTYPE *adr_dst, *dl, *dp = NULL;
2071 mlib_s32 wid, hgt, sll, dll;
2072 mlib_s32 nchannel, chan1;
2073 mlib_s32 i, j, c;
2074 mlib_s32 chan2;
2075 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2076 GET_SRC_DST_PARAMETERS(DTYPE);
2077
2078 #if IMG_TYPE != 1
2079 shift1 = 16;
2080 #else
2081 shift1 = 8;
2082 #endif /* IMG_TYPE != 1 */
2083 shift2 = scale - shift1;
2084
2085 chan1 = nchannel;
2086 chan2 = chan1 + chan1;
2087
2088 wid -= (m - 1);
2089 hgt -= (n - 1);
2090 adr_dst += dn*dll + dm*nchannel;
2091
2092 if (wid > BUFF_SIZE) {
2093 buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2094
2095 if (buffd == NULL) return MLIB_FAILURE;
2096 }
2097
2098 if (m*n > MAX_N*MAX_N) {
2099 k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2100
2101 if (k == NULL) {
2102 if (buffd != buff) mlib_free(buffd);
2103 return MLIB_FAILURE;
2104 }
2105 }
2106
2107 for (i = 0; i < m*n; i++) {
2108 k[i] = kernel[i] >> shift1;
2109 }
2110
2111 for (c = 0; c < nchannel; c++) {
2112 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2113
2114 sl = adr_src + c;
2115 dl = adr_dst + c;
2116
2117 #ifdef __SUNPRO_C
2118 #pragma pipeloop(0)
2119 #endif /* __SUNPRO_C */
2120 for (i = 0; i < wid; i++) buffd[i] = 0;
2121
2122 for (j = 0; j < hgt; j++) {
2123 mlib_s32 *pk = k;
2124
2125 for (l = 0; l < n; l++) {
2126 DTYPE *sp0 = sl + l*sll;
2127
2128 for (off = 0; off < m;) {
2129 sp = sp0 + off*chan1;
2130 dp = dl;
2131
2132 kw = m - off;
2133
2134 if (kw > 2*MAX_KER) kw = MAX_KER; else
2135 if (kw > MAX_KER) kw = kw/2;
2136 off += kw;
2137
2138 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2139 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2140
2141 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2142 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2143 pk += kw;
2144
2145 sp += (kw - 1)*chan1;
2146
2147 if (kw == 7) {
2148
2149 if (l < (n - 1) || off < m) {
2150 #ifdef __SUNPRO_C
2151 #pragma pipeloop(0)
2152 #endif /* __SUNPRO_C */
2153 for (i = 0; i <= (wid - 2); i += 2) {
2154 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2155 p6 = sp[0];
2156 p7 = sp[chan1];
2157
2158 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2159 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2160
2161 sp += chan2;
2162 }
2163
2164 } else {
2165 #ifdef __SUNPRO_C
2166 #pragma pipeloop(0)
2167 #endif /* __SUNPRO_C */
2168 for (i = 0; i <= (wid - 2); i += 2) {
2169 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2170 p6 = sp[0];
2171 p7 = sp[chan1];
2172
2173 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2174 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2175
2176 STORE_RES(dp[0 ], d0);
2177 STORE_RES(dp[chan1], d1);
2178
2179 buffd[i ] = 0;
2180 buffd[i + 1] = 0;
2181
2182 sp += chan2;
2183 dp += chan2;
2184 }
2185 }
2186
2187 } else if (kw == 6) {
2188
2189 if (l < (n - 1) || off < m) {
2190 #ifdef __SUNPRO_C
2191 #pragma pipeloop(0)
2192 #endif /* __SUNPRO_C */
2193 for (i = 0; i <= (wid - 2); i += 2) {
2194 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2195 p5 = sp[0];
2196 p6 = sp[chan1];
2197
2198 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2199 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2200
2201 sp += chan2;
2202 }
2203
2204 } else {
2205 #ifdef __SUNPRO_C
2206 #pragma pipeloop(0)
2207 #endif /* __SUNPRO_C */
2208 for (i = 0; i <= (wid - 2); i += 2) {
2209 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2210 p5 = sp[0];
2211 p6 = sp[chan1];
2212
2213 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2214 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2215
2216 STORE_RES(dp[0 ], d0);
2217 STORE_RES(dp[chan1], d1);
2218
2219 buffd[i ] = 0;
2220 buffd[i + 1] = 0;
2221
2222 sp += chan2;
2223 dp += chan2;
2224 }
2225 }
2226
2227 } else if (kw == 5) {
2228
2229 if (l < (n - 1) || off < m) {
2230 #ifdef __SUNPRO_C
2231 #pragma pipeloop(0)
2232 #endif /* __SUNPRO_C */
2233 for (i = 0; i <= (wid - 2); i += 2) {
2234 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2235 p4 = sp[0];
2236 p5 = sp[chan1];
2237
2238 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2239 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2240
2241 sp += chan2;
2242 }
2243
2244 } else {
2245 #ifdef __SUNPRO_C
2246 #pragma pipeloop(0)
2247 #endif /* __SUNPRO_C */
2248 for (i = 0; i <= (wid - 2); i += 2) {
2249 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2250 p4 = sp[0];
2251 p5 = sp[chan1];
2252
2253 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2254 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2255
2256 STORE_RES(dp[0 ], d0);
2257 STORE_RES(dp[chan1], d1);
2258
2259 buffd[i ] = 0;
2260 buffd[i + 1] = 0;
2261
2262 sp += chan2;
2263 dp += chan2;
2264 }
2265 }
2266
2267 } else if (kw == 4) {
2268
2269 if (l < (n - 1) || off < m) {
2270 #ifdef __SUNPRO_C
2271 #pragma pipeloop(0)
2272 #endif /* __SUNPRO_C */
2273 for (i = 0; i <= (wid - 2); i += 2) {
2274 p0 = p2; p1 = p3; p2 = p4;
2275 p3 = sp[0];
2276 p4 = sp[chan1];
2277
2278 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2279 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2280
2281 sp += chan2;
2282 }
2283
2284 } else {
2285 #ifdef __SUNPRO_C
2286 #pragma pipeloop(0)
2287 #endif /* __SUNPRO_C */
2288 for (i = 0; i <= (wid - 2); i += 2) {
2289 p0 = p2; p1 = p3; p2 = p4;
2290 p3 = sp[0];
2291 p4 = sp[chan1];
2292
2293 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2294 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2295
2296 STORE_RES(dp[0 ], d0);
2297 STORE_RES(dp[chan1], d1);
2298
2299 buffd[i ] = 0;
2300 buffd[i + 1] = 0;
2301
2302 sp += chan2;
2303 dp += chan2;
2304 }
2305 }
2306
2307 } else if (kw == 3) {
2308
2309 if (l < (n - 1) || off < m) {
2310 #ifdef __SUNPRO_C
2311 #pragma pipeloop(0)
2312 #endif /* __SUNPRO_C */
2313 for (i = 0; i <= (wid - 2); i += 2) {
2314 p0 = p2; p1 = p3;
2315 p2 = sp[0];
2316 p3 = sp[chan1];
2317
2318 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2319 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2320
2321 sp += chan2;
2322 }
2323
2324 } else {
2325 #ifdef __SUNPRO_C
2326 #pragma pipeloop(0)
2327 #endif /* __SUNPRO_C */
2328 for (i = 0; i <= (wid - 2); i += 2) {
2329 p0 = p2; p1 = p3;
2330 p2 = sp[0];
2331 p3 = sp[chan1];
2332
2333 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2334 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2335
2336 STORE_RES(dp[0 ], d0);
2337 STORE_RES(dp[chan1], d1);
2338
2339 buffd[i ] = 0;
2340 buffd[i + 1] = 0;
2341
2342 sp += chan2;
2343 dp += chan2;
2344 }
2345 }
2346
2347 } else if (kw == 2) {
2348
2349 if (l < (n - 1) || off < m) {
2350 #ifdef __SUNPRO_C
2351 #pragma pipeloop(0)
2352 #endif /* __SUNPRO_C */
2353 for (i = 0; i <= (wid - 2); i += 2) {
2354 p0 = p2;
2355 p1 = sp[0];
2356 p2 = sp[chan1];
2357
2358 buffd[i ] += p0*k0 + p1*k1;
2359 buffd[i + 1] += p1*k0 + p2*k1;
2360
2361 sp += chan2;
2362 }
2363
2364 } else {
2365 #ifdef __SUNPRO_C
2366 #pragma pipeloop(0)
2367 #endif /* __SUNPRO_C */
2368 for (i = 0; i <= (wid - 2); i += 2) {
2369 p0 = p2;
2370 p1 = sp[0];
2371 p2 = sp[chan1];
2372
2373 d0 = (p0*k0 + p1*k1 + buffd[i ]);
2374 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2375
2376 STORE_RES(dp[0 ], d0);
2377 STORE_RES(dp[chan1], d1);
2378
2379 buffd[i ] = 0;
2380 buffd[i + 1] = 0;
2381
2382 sp += chan2;
2383 dp += chan2;
2384 }
2385 }
2386
2387 } else /*if (kw == 1)*/ {
2388
2389 if (l < (n - 1) || off < m) {
2390 #ifdef __SUNPRO_C
2391 #pragma pipeloop(0)
2392 #endif /* __SUNPRO_C */
2393 for (i = 0; i <= (wid - 2); i += 2) {
2394 p0 = sp[0];
2395 p1 = sp[chan1];
2396
2397 buffd[i ] += p0*k0;
2398 buffd[i + 1] += p1*k0;
2399
2400 sp += chan2;
2401 }
2402
2403 } else {
2404 #ifdef __SUNPRO_C
2405 #pragma pipeloop(0)
2406 #endif /* __SUNPRO_C */
2407 for (i = 0; i <= (wid - 2); i += 2) {
2408 p0 = sp[0];
2409 p1 = sp[chan1];
2410
2411 d0 = (p0*k0 + buffd[i ]);
2412 d1 = (p1*k0 + buffd[i + 1]);
2413
2414 STORE_RES(dp[0 ], d0);
2415 STORE_RES(dp[chan1], d1);
2416
2417 buffd[i ] = 0;
2418 buffd[i + 1] = 0;
2419
2420 sp += chan2;
2421 dp += chan2;
2422 }
2423 }
2424 }
2425 }
2426 }
2427
2428 /* last pixels */
2429 for (; i < wid; i++) {
2430 mlib_s32 *pk = k, s = 0;
2431 mlib_s32 x;
2432
2433 for (l = 0; l < n; l++) {
2434 sp = sl + l*sll + i*chan1;
2435
2436 for (x = 0; x < m; x++) {
2437 s += sp[0] * pk[0];
2438 sp += chan1;
2439 pk ++;
2440 }
2441 }
2442
2443 STORE_RES(dp[0], s);
2444
2445 sp += chan1;
2446 dp += chan1;
2447 }
2448
2449 sl += sll;
2450 dl += dll;
2451 }
2452 }
2453
2454 if (buffd != buff) mlib_free(buffd);
2455 if (k != k_locl) mlib_free(k);
2456
2457 return MLIB_SUCCESS;
2458 }
2459
2460 /***************************************************************/
2461 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
2462
2463 /***************************************************************/
2464