1 /*
2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 /*
28  * FUNCTION
29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
30  *   MLIB_EDGE_SRC_EXTEND mask
31  */
32 
33 #include "mlib_image.h"
34 #include "mlib_ImageConv.h"
35 #include "mlib_c_ImageConv.h"
36 
37 /*
38  * This define switches between functions of different data types
39  */
40 
/*
 * IMG_TYPE picks exactly one of the three macro sets below:
 *   1 -> mlib_u8,  2 -> mlib_s16,  3 -> mlib_u16.
 * It is hard-wired to 3 here, so only the mlib_u16 set is active.
 */
41 #define IMG_TYPE 3
42 
43 /***************************************************************/
/*
 * Meaning of the per-type macros (identical roles in every branch):
 *   DTYPE            pixel element type of the source and destination.
 *   CONV_FUNC(KERN), CONV_FUNC_I(KERN)
 *                    token-paste KERN into a function name and append the
 *                    PARAM parameter list (no use is visible in this part
 *                    of the file).
 *   CONV_FUNC_MxN, CONV_FUNC_MxN_I
 *                    name plus PARAM_MxN parameter list of the two M x N
 *                    entry points defined further down.
 *   DSCALE           initial value of fscale in CONV_FUNC_MxN, before it is
 *                    divided by 2^scale.
 *   FROM_S32(x)      turns the clamped 32-bit value produced by D2I into the
 *                    value stored in a pixel: keep the high bits and, for
 *                    the unsigned types, flip the top kept bit.
 *   S64TOS32(x)      applied by LOAD_BUFF to the sample that lands in the
 *                    low 32 bits of a packed 64-bit store.
 *   SAT_OFF          pasted textually after the operand inside D2I; either
 *                    empty or "-(1u << 31)".
 */
44 #if IMG_TYPE == 1
45 
/*
 * mlib_u8: D2I subtracts 2^31 before clamping; FROM_S32 keeps the top byte
 * and flips bit 7 (^ 128), which compensates that bias once the value is
 * truncated to 8 bits.  Relies on >> of a negative mlib_s32 being an
 * arithmetic shift.
 */
46 #define DTYPE             mlib_u8
47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
51 #define DSCALE            (1 << 24)
52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
53 #define S64TOS32(x)       (x)
54 #define SAT_OFF           -(1u << 31)
55 
56 #elif IMG_TYPE == 2
57 
/*
 * mlib_s16: no bias (SAT_OFF is empty).  S64TOS32 masks to the low 32 bits
 * so that a negative, sign-extended sample cannot overwrite the upper half
 * of the 64-bit word assembled by LOAD_BUFF.
 */
58 #define DTYPE             mlib_s16
59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
63 #define DSCALE            65536.0
64 #define FROM_S32(x)       ((x) >> 16)
65 #define S64TOS32(x)       ((x) & 0xffffffff)
66 #define SAT_OFF
67 
68 #elif IMG_TYPE == 3
69 
/*
 * mlib_u16 (the active set): D2I subtracts 2^31 before clamping; FROM_S32
 * keeps the top 16 bits and flips bit 15 (^ 0x8000), which compensates that
 * bias once the value is truncated to 16 bits.  Relies on >> of a negative
 * mlib_s32 being an arithmetic shift.
 */
70 #define DTYPE             mlib_u16
71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
75 #define DSCALE            65536.0
76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
77 #define S64TOS32(x)       (x)
78 #define SAT_OFF           -(1u << 31)
79 
80 #endif /* IMG_TYPE == 1 */
81 
82 /***************************************************************/
/*
 * Parameter list appended by CONV_FUNC(KERN) and CONV_FUNC_I(KERN).
 * No function using it is visible in this part of the file, so the meaning
 * of the individual parameters is not documented here.
 */
83 #define PARAM                                                   \
84   mlib_image       *dst,                                        \
85   const mlib_image *src,                                        \
86   mlib_s32         dx_l,                                        \
87   mlib_s32         dx_r,                                        \
88   mlib_s32         dy_t,                                        \
89   mlib_s32         dy_b,                                        \
90   const mlib_s32   *kern,                                       \
91   mlib_s32         scalef_expon,                                \
92   mlib_s32         cmask
93 
94 /***************************************************************/
/*
 * Parameter list of CONV_FUNC_MxN and CONV_FUNC_MxN_I.  As used by
 * CONV_FUNC_MxN in this file:
 *   dst, src     destination and source images.
 *   kernel       m*n integer coefficients, read row by row.
 *   m, n         kernel width and height.
 *   dx_l, dx_r   number of columns added on the left / right of each
 *                buffered row by repeating its first / last pixel.
 *   dy_t, dy_b   number of rows at the top / bottom for which the source
 *                row pointer is not advanced, so an edge row is reused.
 *   scale        each coefficient is multiplied by DSCALE / 2^scale.
 *   cmask        bit (nchannel - 1 - c) enables processing of channel c.
 */
95 #define PARAM_MxN                                               \
96   mlib_image       *dst,                                        \
97   const mlib_image *src,                                        \
98   const mlib_s32   *kernel,                                     \
99   mlib_s32         m,                                           \
100   mlib_s32         n,                                           \
101   mlib_s32         dx_l,                                        \
102   mlib_s32         dx_r,                                        \
103   mlib_s32         dy_t,                                        \
104   mlib_s32         dy_b,                                        \
105   mlib_s32         scale,                                       \
106   mlib_s32         cmask
107 
108 /***************************************************************/
/* Floating-point type used for kernels, row buffers and accumulators. */
109 #define FTYPE mlib_d64
110 
111 #ifndef MLIB_USE_FTOI_CLAMPING
112 
/*
 * Saturating conversion to mlib_s32: values at or below MLIB_S32_MIN and at
 * or above MLIB_S32_MAX are pinned to those limits, everything else is cast.
 * The argument is evaluated up to three times, so it must be free of side
 * effects.
 */
113 #define CLAMP_S32(x)                                            \
114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
115 
116 #else
117 
/*
 * Plain cast with no explicit range check.
 * NOTE(review): presumably selected on targets whose float-to-int
 * conversion already saturates - confirm where MLIB_USE_FTOI_CLAMPING is set.
 */
118 #define CLAMP_S32(x) ((mlib_s32)(x))
119 
120 #endif /* MLIB_USE_FTOI_CLAMPING */
121 
122 /***************************************************************/
/*
 * Convert an accumulated FTYPE sum to mlib_s32.  SAT_OFF is pasted directly
 * after the parenthesised operand, so for the unsigned pixel types the
 * expression becomes "(x) -(1u << 31)" before clamping; for mlib_s16 it is
 * just "(x)".  FROM_S32 then produces the stored pixel value.
 */
123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
124 
125 /***************************************************************/
/*
 * STORE2 writes two results to dp[0] and dp[chan1]; the little-endian
 * variant swaps which result goes to which slot.  It expects dp and chan1 to
 * exist at the expansion site and expands to two statements without a
 * trailing semicolon.  No use is visible in this part of the file.
 */
126 #ifdef _LITTLE_ENDIAN
127 
128 #define STORE2(res0, res1)                                      \
129   dp[0    ] = res1;                                             \
130   dp[chan1] = res0
131 
132 #else
133 
134 #define STORE2(res0, res1)                                      \
135   dp[0    ] = res0;                                             \
136   dp[chan1] = res1
137 
138 #endif /* _LITTLE_ENDIAN */
139 
140 /***************************************************************/
/*
 * LOAD_BUFF copies the two source samples sp[0] and sp[chan1] into
 * buff[i] and buff[i + 1].  It expects i, sp and chan1 to exist at the
 * expansion site; in this file it is expanded with buff = buffi, an
 * mlib_s32 array.
 *
 * Without long long support the two elements are stored separately.
 * Otherwise both samples are packed into one 64-bit value and written with a
 * single store; the half that holds sp[0] is chosen per endianness so that
 * it lands in buff[i], matching the two-store variant.
 *
 * NOTE(review): the 64-bit variants write through a casted mlib_s64 pointer,
 * which requires (buff + i) to be 8-byte aligned and type-puns the buffer.
 */
141 #ifdef _NO_LONGLONG
142 
143 #define LOAD_BUFF(buff)                                         \
144   buff[i    ] = sp[0];                                          \
145   buff[i + 1] = sp[chan1]
146 
147 #else /* _NO_LONGLONG */
148 
149 #ifdef _LITTLE_ENDIAN
150 
151 #define LOAD_BUFF(buff)                                         \
152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
153 
154 #else /* _LITTLE_ENDIAN */
155 
156 #define LOAD_BUFF(buff)                                         \
157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
158 
159 #endif /* _LITTLE_ENDIAN */
160 #endif /* _NO_LONGLONG */
161 
162 /***************************************************************/
/*
 * Overlay of one mlib_d64 with two mlib_s32 values.  CONV_FUNC_MxN loads a
 * 64-bit word from its integer scratch buffer into d64 and then reads the
 * two integers written by LOAD_BUFF back through i32s.i0 and i32s.i1.
 */
163 typedef union {
164   mlib_d64 d64;
165   struct {
166     mlib_s32 i0;
167     mlib_s32 i1;
168   } i32s;
169 } d64_2x32;
170 
171 /***************************************************************/
/*
 * Declares a standard set of local pointers and counters for a convolution
 * function working on elements of the given type.  The initializer of pbuff
 * refers to a variable named buff, which must already be declared where the
 * macro is expanded.  The functions visible in this part of the file declare
 * their locals by hand instead of using this macro.
 */
172 #define DEF_VARS(type)                                          \
173   type     *adr_src, *sl, *sp, *sl1;                            \
174   type     *adr_dst, *dl, *dp;                                  \
175   FTYPE    *pbuff = buff;                                       \
176   mlib_s32 *buffi, *buffo;                                      \
177   mlib_s32 wid, hgt, sll, dll;                                  \
178   mlib_s32 nchannel, chan1, chan2;                              \
179   mlib_s32 i, j, c, swid
180 
181 /***************************************************************/
/*
 * Fills the locals hgt, wid, nchannel, sll, dll, adr_src and adr_dst from the
 * images named src and dst at the expansion site.  Height, width and channel
 * count are taken from src only.  The strides are divided by sizeof(type),
 * i.e. converted to element units (presumably from bytes - confirm against
 * mlib_ImageGetStride).
 */
182 #define GET_SRC_DST_PARAMETERS(type)                            \
183   hgt = mlib_ImageGetHeight(src);                               \
184   wid = mlib_ImageGetWidth(src);                                \
185   nchannel = mlib_ImageGetChannels(src);                        \
186   sll = mlib_ImageGetStride(src) / sizeof(type);                \
187   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
188   adr_src = (type *)mlib_ImageGetData(src);                     \
189   adr_dst = (type *)mlib_ImageGetData(dst)
190 
191 /***************************************************************/
/*
 * CLAMP_STORE(dst, val) stores val into dst, saturated to the range of the
 * active pixel type.  It is only defined when __sparc is not defined and is
 * used by STORE_RES further down.  The arguments are not parenthesised and
 * are evaluated more than once, so they must be simple lvalues/variables.
 */
192 #ifndef __sparc
193 #if IMG_TYPE == 1
194 
195 /*
196  * Test for the presence of any "1" bit in bits
197    8 to 31 of val. If present, then val is either
198    negative or >255. If over/underflows of 8 bits
199    are uncommon, then this technique can be a win,
200    since only a single test, rather than two, is
201    necessary to determine if clamping is needed.
202    On the other hand, if over/underflows are common,
203    it adds an extra test.
204 */
/* This variant expands to a complete braced if/else statement. */
205 #define CLAMP_STORE(dst, val)                                   \
206   if (val & 0xffffff00) {                                       \
207     if (val < MLIB_U8_MIN)                                      \
208       dst = MLIB_U8_MIN;                                        \
209     else                                                        \
210       dst = MLIB_U8_MAX;                                        \
211   } else {                                                      \
212     dst = (mlib_u8)val;                                         \
213   }
214 
215 #elif IMG_TYPE == 2
216 
/*
 * Signed 16-bit: explicit comparison against both limits.  Expands to an
 * if / else-if / else chain whose last statement has no trailing semicolon.
 */
217 #define CLAMP_STORE(dst, val)                                   \
218   if (val >= MLIB_S16_MAX)                                      \
219     dst = MLIB_S16_MAX;                                         \
220   else if (val <= MLIB_S16_MIN)                                 \
221     dst = MLIB_S16_MIN;                                         \
222   else                                                          \
223     dst = (mlib_s16)val
224 
225 #elif IMG_TYPE == 3
226 
/*
 * Unsigned 16-bit (the active variant): same shape as the signed case, using
 * the MLIB_U16_MIN / MLIB_U16_MAX limits.
 */
227 #define CLAMP_STORE(dst, val)                                   \
228   if (val >= MLIB_U16_MAX)                                      \
229     dst = MLIB_U16_MAX;                                         \
230   else if (val <= MLIB_U16_MIN)                                 \
231     dst = MLIB_U16_MIN;                                         \
232   else                                                          \
233     dst = (mlib_u16)val
234 
235 #endif /* IMG_TYPE == 1 */
236 #endif /* __sparc */
237 
238 /***************************************************************/
/* Largest number of kernel columns handled by one unrolled pass in the MxN code. */
239 #define MAX_KER   7
/* Kernel height up to which the on-stack row-pointer table (2*(MAX_N + 1) entries) suffices;
   MAX_N*MAX_N also sizes the on-stack integer kernel copy in CONV_FUNC_MxN_I. */
240 #define MAX_N    15
/* Element count of the on-stack scratch buffers; larger needs are served by mlib_malloc. */
241 #define BUFF_SIZE   1600
/* Byte budget used by mlib_ImageConv1xN_ext to choose how many rows to process per strip. */
242 #define CACHE_SIZE  (64*1024)
243 
/*
 * Vertical (1 x n) convolution of a DTYPE image with replicated edge rows.
 * CONV_FUNC_MxN calls this when the kernel width m is 1.
 *
 *   dst, src   destination and source images; geometry is read from src.
 *   k          n coefficients as FTYPE (CONV_FUNC_MxN passes them already
 *              multiplied by its scale factor).
 *   n          kernel height.
 *   dy_t       number of leading window rows filled with the first source row.
 *   dy_b       number of trailing window rows filled with the last source
 *              row that is read.
 *   cmask      bit (nchannel - 1 - c) enables processing of channel c.
 *
 * The image is processed in horizontal strips of at most max_hsize output
 * rows; within a strip each column is copied into a contiguous FTYPE buffer
 * and convolved there.
 *
 * Returns MLIB_FAILURE if the scratch buffer cannot be allocated, otherwise
 * MLIB_SUCCESS.
 */
mlib_ImageConv1xN_ext(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dy_t,mlib_s32 dy_b,mlib_s32 cmask)244 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
245                                          const mlib_image *src,
246                                          const mlib_d64   *k,
247                                          mlib_s32         n,
248                                          mlib_s32         dy_t,
249                                          mlib_s32         dy_b,
250                                          mlib_s32         cmask)
251 {
252   DTYPE    *adr_src, *sl;
253   DTYPE    *adr_dst, *dl, *dp;
254   FTYPE    buff[BUFF_SIZE];
255   FTYPE    *buffd;
256   FTYPE    *pbuff = buff;
257   const FTYPE    *pk;
258   FTYPE    k0, k1, k2, k3;
259   FTYPE    p0, p1, p2, p3, p4;
260   FTYPE    *sbuff;
261   mlib_s32 l, k_off, off, bsize;
262   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
263   mlib_s32 d0, d1, ii;
264   mlib_s32 wid, hgt, sll, dll;
265   mlib_s32 nchannel;
266   mlib_s32 i, j, c;
267   GET_SRC_DST_PARAMETERS(DTYPE);
268 
  /* Rows per strip: as many source rows of sll elements as fit in CACHE_SIZE
     bytes, minus the n - 1 extra rows the kernel needs; clamped to [1, hgt]. */
269   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
270 
271   if (max_hsize < 1) max_hsize = 1;
272   if (max_hsize > hgt) max_hsize = hgt;
273 
274   shgt = hgt + (n - 1);
275   smax_hsize = max_hsize + (n - 1);
276 
  /* One allocation holds the column buffer (smax_hsize entries) followed by
     the accumulator buffd; the "+ 1" leaves slack for the unrolled loops. */
277   bsize = 2 * (smax_hsize + 1);
278 
279   if (bsize > BUFF_SIZE) {
280     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
281 
282     if (pbuff == NULL) return MLIB_FAILURE;
283   }
284 
285   sbuff = pbuff;
286   buffd = sbuff + smax_hsize;
287 
  /* From here on shgt is the number of source rows actually read. */
288   shgt -= (dy_t + dy_b);
289   k_off = 0;
290 
  /* l walks over strips; k_off is the index of the strip's first window row. */
291   for (l = 0; l < hgt; l += hsize) {
292     hsize = hgt - l;
293 
294     if (hsize > max_hsize) hsize = max_hsize;
295 
296     smax_hsize = hsize + (n - 1);
297 
298     for (c = 0; c < nchannel; c++) {
299       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
300 
301       sl = adr_src + c;
302       dl = adr_dst + c;
303 
304 #ifdef __SUNPRO_C
305 #pragma pipeloop(0)
306 #endif /* __SUNPRO_C */
307       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
308 
      /* One iteration per image column. */
309       for (j = 0; j < wid; j++) {
        /* Shadows the function-level array: inside this loop buff is a
           cursor into sbuff that advances with the kernel offset. */
310         FTYPE    *buff = sbuff;
311 
        /* Gather the column into sbuff: first the rows above the image
           (copies of the first source row) ... */
312         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
313           sbuff[i - k_off] = (FTYPE)sl[0];
314         }
315 
        /* ... then the rows that exist in the source ... */
316 #ifdef __SUNPRO_C
317 #pragma pipeloop(0)
318 #endif /* __SUNPRO_C */
319         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
320           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
321         }
322 
        /* ... and finally the rows below (copies of source row shgt - 1). */
323         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
324           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
325         }
326 
327         pk = k;
328 
        /* Consume the kernel four taps at a time, adding partial sums to
           buffd, and stop while 1..4 taps are still left for the final
           pass below.  The inner loop is unrolled by two and may also
           touch buffd[hsize] when hsize is odd. */
329         for (off = 0; off < (n - 4); off += 4) {
330 
331           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
332           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
333 
334 #ifdef __SUNPRO_C
335 #pragma pipeloop(0)
336 #endif /* __SUNPRO_C */
337           for (i = 0; i < hsize; i += 2) {
338             p0 = p2; p1 = p3; p2 = p4;
339 
340             p3 = buff[i + 3]; p4 = buff[i + 4];
341 
342             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
343             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
344           }
345 
346           pk += 4;
347           buff += 4;
348         }
349 
        /* Final pass: kh = remaining taps.  Each branch adds the last taps
           to the accumulated sum, converts with D2I / FROM_S32, writes two
           output rows per iteration (dp[0] and dp[dll]) and resets buffd
           for the next column.  An odd last row is handled after the loop. */
350         dp = dl;
351         kh = n - off;
352 
353         if (kh == 4) {
354           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
355           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
356 
357 #ifdef __SUNPRO_C
358 #pragma pipeloop(0)
359 #endif /* __SUNPRO_C */
360           for (i = 0; i <= (hsize - 2); i += 2) {
361             p0 = p2; p1 = p3; p2 = p4;
362 
363             p3 = buff[i + 3]; p4 = buff[i + 4];
364 
365             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
366             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
367 
368             dp[0  ] = FROM_S32(d0);
369             dp[dll] = FROM_S32(d1);
370 
371             buffd[i    ] = 0.0;
372             buffd[i + 1] = 0.0;
373 
374             dp += 2*dll;
375           }
376 
377           if (i < hsize) {
378             p0 = p2; p1 = p3; p2 = p4;
379             p3 = buff[i + 3];
380             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
381             dp[0] = FROM_S32(d0);
382             buffd[i] = 0.0;
383           }
384 
385         } else if (kh == 3) {
386 
387           p2 = buff[0]; p3 = buff[1];
388           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
389 
390 #ifdef __SUNPRO_C
391 #pragma pipeloop(0)
392 #endif /* __SUNPRO_C */
393           for (i = 0; i <= (hsize - 2); i += 2) {
394             p0 = p2; p1 = p3;
395 
396             p2 = buff[i + 2]; p3 = buff[i + 3];
397 
398             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
399             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
400 
401             dp[0  ] = FROM_S32(d0);
402             dp[dll] = FROM_S32(d1);
403 
404             buffd[i    ] = 0.0;
405             buffd[i + 1] = 0.0;
406 
407             dp += 2*dll;
408           }
409 
410           if (i < hsize) {
411             p0 = p2; p1 = p3;
412             p2 = buff[i + 2];
413             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
414             dp[0] = FROM_S32(d0);
415 
416             buffd[i] = 0.0;
417           }
418 
419         } else if (kh == 2) {
420 
421           p2 = buff[0];
422           k0 = pk[0]; k1 = pk[1];
423 
424 #ifdef __SUNPRO_C
425 #pragma pipeloop(0)
426 #endif /* __SUNPRO_C */
427           for (i = 0; i <= (hsize - 2); i += 2) {
428             p0 = p2;
429 
430             p1 = buff[i + 1]; p2 = buff[i + 2];
431 
432             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
433             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
434 
435             dp[0  ] = FROM_S32(d0);
436             dp[dll] = FROM_S32(d1);
437 
438             buffd[i    ] = 0.0;
439             buffd[i + 1] = 0.0;
440 
441             dp += 2*dll;
442           }
443 
444           if (i < hsize) {
445             p0 = p2;
446             p1 = buff[i + 1];
447             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
448             dp[0] = FROM_S32(d0);
449 
450             buffd[i] = 0.0;
451           }
452 
453         } else /* kh == 1 */{
454 
455           k0 = pk[0];
456 
457 #ifdef __SUNPRO_C
458 #pragma pipeloop(0)
459 #endif /* __SUNPRO_C */
460           for (i = 0; i <= (hsize - 2); i += 2) {
461             p0 = buff[i]; p1 = buff[i + 1];
462 
463             d0 = D2I(p0*k0 + buffd[i    ]);
464             d1 = D2I(p1*k0 + buffd[i + 1]);
465 
466             dp[0  ] = FROM_S32(d0);
467             dp[dll] = FROM_S32(d1);
468 
469             buffd[i    ] = 0.0;
470             buffd[i + 1] = 0.0;
471 
472             dp += 2*dll;
473           }
474 
475           if (i < hsize) {
476             p0 = buff[i];
477             d0 = D2I(p0*k0 + buffd[i]);
478             dp[0] = FROM_S32(d0);
479 
480             buffd[i] = 0.0;
481           }
482         }
483 
        /* Advance to the next pixel of this channel in the same row
           (the original comment below says "line", but the step is one
           pixel: nchannel elements). */
484         /* next line */
485         sl += nchannel;
486         dl += nchannel;
487       }
488     }
489 
    /* Next strip: the source is addressed through k_off, the destination by
       moving its base pointer down max_hsize rows. */
490     k_off += max_hsize;
491     adr_dst += max_hsize*dll;
492   }
493 
494   if (pbuff != buff) mlib_free(pbuff);
495 
496   return MLIB_SUCCESS;
497 }
498 
499 /***************************************************************/
/*
 * M x N convolution of a DTYPE image using floating-point arithmetic, with
 * edge pixels replicated from the source.  The function name and parameter
 * list come from the CONV_FUNC_MxN / PARAM_MxN macros.
 *
 *   kernel       m*n integer coefficients; each is converted to FTYPE and
 *                multiplied by DSCALE / 2^scale.
 *   dx_l, dx_r   columns added left / right of each buffered row by
 *                repeating its first / last pixel.
 *   dy_t, dy_b   rows at the top / bottom for which the source row pointer
 *                is not advanced, so an edge row is reused.
 *   cmask        bit (nchannel - 1 - c) enables processing of channel c.
 *
 * Working storage is a ring of n + 1 FTYPE row buffers, one FTYPE
 * accumulator row (buffd) and one integer scratch row (buffi).  For every
 * output row the kernel rows are applied in column chunks of at most
 * MAX_KER taps; the very last chunk also converts and stores the result and
 * loads the next source row into the spare ring slot.
 *
 * A kernel of width 1 is delegated to mlib_ImageConv1xN_ext.
 *
 * Returns MLIB_FAILURE when a scratch allocation fails, otherwise the value
 * of status (MLIB_SUCCESS, or the result of mlib_ImageConv1xN_ext).
 *
 * FREE_AND_RETURN_STATUS is not defined in this part of the file.
 * NOTE(review): presumably it frees pbuff and k when they were heap
 * allocated and returns status - confirm in the included headers.
 */
500 mlib_status CONV_FUNC_MxN
501 {
502   DTYPE    *adr_src, *sl, *sp = NULL;
503   DTYPE    *adr_dst, *dl, *dp = NULL;
504   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
505   FTYPE    **buffs = buffs_arr, *buffd;
506   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
507   FTYPE    *pbuff = buff;
508   FTYPE    k0, k1, k2, k3, k4, k5, k6;
509   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
510   mlib_s32 *buffi;
511   mlib_s32 mn, l, off, kw, bsize, buff_ind;
512   mlib_s32 d0, d1;
513   mlib_s32 wid, hgt, sll, dll;
514   mlib_s32 nchannel, chan1, chan2;
515   mlib_s32 i, j, c, swid;
516   d64_2x32 dd;
517   mlib_status status = MLIB_SUCCESS;
518 
519   GET_SRC_DST_PARAMETERS(DTYPE);
520 
  /* fscale = DSCALE / 2^scale, split in two steps so that the shift count
     stays at or below 30.
     NOTE(review): only one reduction by 30 is done, so scale > 60 would
     still reach an out-of-range shift below - presumably callers bound
     scale; confirm. */
521   if (scale > 30) {
522     fscale *= 1.0/(1 << 30);
523     scale -= 30;
524   }
525 
526   fscale /= (1 << scale);
527 
528   mn = m*n;
529 
  /* Kernels with more than 256 coefficients do not fit akernel. */
530   if (mn > 256) {
531     k = mlib_malloc(mn*sizeof(mlib_d64));
532 
533     if (k == NULL) return MLIB_FAILURE;
534   }
535 
536   for (i = 0; i < mn; i++) {
537     k[i] = kernel[i]*fscale;
538   }
539 
540   if (m == 1) {
541     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
542     FREE_AND_RETURN_STATUS;
543   }
544 
  /* swid: width of one buffered row including the m - 1 extra columns. */
545   swid = wid + (m - 1);
546 
  /* n + 1 ring rows, plus buffd, plus one FTYPE-sized row reused as the
     mlib_s32 scratch buffi. */
547   bsize = (n + 3)*swid;
548 
  /* When allocating, the row-pointer table is placed right after the
     bsize FTYPE elements in the same block. */
549   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
550     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
551 
552     if (pbuff == NULL) {
553       status = MLIB_FAILURE;
554       FREE_AND_RETURN_STATUS;
555     }
556     buffs = (FTYPE   **)(pbuff + bsize);
557   }
558 
  /* The second half of buffs repeats the first, so buffs + buff_ind can be
     indexed with 0..n without wrapping. */
559   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
560   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
561   buffd = buffs[n] + swid;
562   buffi = (mlib_s32*)(buffd + swid);
563 
564   chan1 = nchannel;
565   chan2 = chan1 + chan1;
566 
  /* From here on swid is the number of columns actually read from the
     source per row. */
567   swid -= (dx_l + dx_r);
568 
569   for (c = 0; c < nchannel; c++) {
570     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
571 
572     sl = adr_src + c;
573     dl = adr_dst + c;
574 
    /* Preload the first n window rows into the ring, padding each row on
       the left and right with its edge pixels. */
575     for (l = 0; l < n; l++) {
      /* Shadows the function-level array within this loop body. */
576       FTYPE    *buff = buffs[l];
577 
578       for (i = 0; i < dx_l; i++) {
579         buff[i] = (FTYPE)sl[0];
580       }
581 
582 #ifdef __SUNPRO_C
583 #pragma pipeloop(0)
584 #endif /* __SUNPRO_C */
585       for (i = 0; i < swid; i++) {
586         buff[i + dx_l] = (FTYPE)sl[i*chan1];
587       }
588 
589       for (i = 0; i < dx_r; i++) {
590         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
591       }
592 
      /* The source row pointer only moves between these bounds, so rows
         before and after them reuse the same source row. */
593       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
594     }
595 
596     buff_ind = 0;
597 
598 #ifdef __SUNPRO_C
599 #pragma pipeloop(0)
600 #endif /* __SUNPRO_C */
601     for (i = 0; i < wid; i++) buffd[i] = 0.0;
602 
    /* One iteration per output row. */
603     for (j = 0; j < hgt; j++) {
      /* buffc[0..n-1] are the n rows under the kernel; buffn = buffc[n] is
         the spare slot that receives the next source row. */
604       FTYPE    **buffc = buffs + buff_ind;
605       FTYPE    *buffn = buffc[n];
606       FTYPE    *pk = k;
607 
608       for (l = 0; l < n; l++) {
609         FTYPE    *buff_l = buffc[l];
610 
611         for (off = 0; off < m;) {
612           FTYPE    *buff = buff_l + off;
613 
          /* Chunk width: MAX_KER while more than 2*MAX_KER columns remain,
             half of the remainder when it lies between MAX_KER + 1 and
             2*MAX_KER, otherwise the whole remainder.  There is no branch
             for kw == 1 below; the final else treats kw as 2. */
614           kw = m - off;
615 
616           if (kw > 2*MAX_KER) kw = MAX_KER; else
617             if (kw > MAX_KER) kw = kw/2;
618           off += kw;
619 
620           sp = sl;
621           dp = dl;
622 
          /* Every kw branch has the same two variants:
               - "l < (n - 1) || off < m": more kernel taps follow, so the
                 partial sums are only added to buffd;
               - otherwise (last chunk of the last kernel row): LOAD_BUFF
                 copies two samples of the next source row into buffi, they
                 are read back through dd and stored as FTYPE in buffn, the
                 final sums are converted with D2I / FROM_S32 and written to
                 dp, and buffd is reset to zero.
             All loops are unrolled by two and stop at i <= wid - 2; an odd
             last column is handled by the "last pixels" loop further down. */
623           if (kw == 7) {
624 
625             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
626             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
627 
628             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
629             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
630 
631             if (l < (n - 1) || off < m) {
632 #ifdef __SUNPRO_C
633 #pragma pipeloop(0)
634 #endif /* __SUNPRO_C */
635               for (i = 0; i <= (wid - 2); i += 2) {
636                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
637 
638                 p6 = buff[i + 6]; p7 = buff[i + 7];
639 
640                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
641                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
642               }
643 
644             } else {
645 #ifdef __SUNPRO_C
646 #pragma pipeloop(0)
647 #endif /* __SUNPRO_C */
648               for (i = 0; i <= (wid - 2); i += 2) {
649                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
650 
651                 p6 = buff[i + 6]; p7 = buff[i + 7];
652 
653                 LOAD_BUFF(buffi);
654 
655                 dd.d64 = *(FTYPE   *)(buffi + i);
656                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
657                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
658 
659                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
660                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
661 
662                 dp[0    ] = FROM_S32(d0);
663                 dp[chan1] = FROM_S32(d1);
664 
665                 buffd[i    ] = 0.0;
666                 buffd[i + 1] = 0.0;
667 
668                 sp += chan2;
669                 dp += chan2;
670               }
671             }
672 
673           } else if (kw == 6) {
674 
675             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
676             p5 = buff[3]; p6 = buff[4];
677 
678             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
679             k4 = pk[4]; k5 = pk[5];
680 
681             if (l < (n - 1) || off < m) {
682 #ifdef __SUNPRO_C
683 #pragma pipeloop(0)
684 #endif /* __SUNPRO_C */
685               for (i = 0; i <= (wid - 2); i += 2) {
686                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
687 
688                 p5 = buff[i + 5]; p6 = buff[i + 6];
689 
690                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
691                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
692               }
693 
694             } else {
695 #ifdef __SUNPRO_C
696 #pragma pipeloop(0)
697 #endif /* __SUNPRO_C */
698               for (i = 0; i <= (wid - 2); i += 2) {
699                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
700 
701                 p5 = buff[i + 5]; p6 = buff[i + 6];
702 
703                 LOAD_BUFF(buffi);
704 
705                 dd.d64 = *(FTYPE   *)(buffi + i);
706                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
707                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
708 
709                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
710                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
711 
712                 dp[0    ] = FROM_S32(d0);
713                 dp[chan1] = FROM_S32(d1);
714 
715                 buffd[i    ] = 0.0;
716                 buffd[i + 1] = 0.0;
717 
718                 sp += chan2;
719                 dp += chan2;
720               }
721             }
722 
723           } else if (kw == 5) {
724 
725             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
726             p5 = buff[3];
727 
728             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
729             k4 = pk[4];
730 
731             if (l < (n - 1) || off < m) {
732 #ifdef __SUNPRO_C
733 #pragma pipeloop(0)
734 #endif /* __SUNPRO_C */
735               for (i = 0; i <= (wid - 2); i += 2) {
736                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
737 
738                 p4 = buff[i + 4]; p5 = buff[i + 5];
739 
740                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
741                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
742               }
743 
744             } else {
745 #ifdef __SUNPRO_C
746 #pragma pipeloop(0)
747 #endif /* __SUNPRO_C */
748               for (i = 0; i <= (wid - 2); i += 2) {
749                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
750 
751                 p4 = buff[i + 4]; p5 = buff[i + 5];
752 
753                 LOAD_BUFF(buffi);
754 
755                 dd.d64 = *(FTYPE   *)(buffi + i);
756                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
757                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
758 
759                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
760                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
761 
762                 dp[0    ] = FROM_S32(d0);
763                 dp[chan1] = FROM_S32(d1);
764 
765                 buffd[i    ] = 0.0;
766                 buffd[i + 1] = 0.0;
767 
768                 sp += chan2;
769                 dp += chan2;
770               }
771             }
772 
773           } else if (kw == 4) {
774 
775             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
776 
777             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
778 
779             if (l < (n - 1) || off < m) {
780 #ifdef __SUNPRO_C
781 #pragma pipeloop(0)
782 #endif /* __SUNPRO_C */
783               for (i = 0; i <= (wid - 2); i += 2) {
784                 p0 = p2; p1 = p3; p2 = p4;
785 
786                 p3 = buff[i + 3]; p4 = buff[i + 4];
787 
788                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
789                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
790               }
791 
792             } else {
793 #ifdef __SUNPRO_C
794 #pragma pipeloop(0)
795 #endif /* __SUNPRO_C */
796               for (i = 0; i <= (wid - 2); i += 2) {
797                 p0 = p2; p1 = p3; p2 = p4;
798 
799                 p3 = buff[i + 3]; p4 = buff[i + 4];
800 
801                 LOAD_BUFF(buffi);
802 
803                 dd.d64 = *(FTYPE   *)(buffi + i);
804                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
805                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
806 
807                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
808                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
809 
810                 dp[0    ] = FROM_S32(d0);
811                 dp[chan1] = FROM_S32(d1);
812 
813                 buffd[i    ] = 0.0;
814                 buffd[i + 1] = 0.0;
815 
816                 sp += chan2;
817                 dp += chan2;
818               }
819             }
820 
821           } else if (kw == 3) {
822 
823             p2 = buff[0]; p3 = buff[1];
824             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
825 
826             if (l < (n - 1) || off < m) {
827 #ifdef __SUNPRO_C
828 #pragma pipeloop(0)
829 #endif /* __SUNPRO_C */
830               for (i = 0; i <= (wid - 2); i += 2) {
831                 p0 = p2; p1 = p3;
832 
833                 p2 = buff[i + 2]; p3 = buff[i + 3];
834 
835                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
836                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
837               }
838 
839             } else {
840 #ifdef __SUNPRO_C
841 #pragma pipeloop(0)
842 #endif /* __SUNPRO_C */
843               for (i = 0; i <= (wid - 2); i += 2) {
844                 p0 = p2; p1 = p3;
845 
846                 p2 = buff[i + 2]; p3 = buff[i + 3];
847 
848                 LOAD_BUFF(buffi);
849 
850                 dd.d64 = *(FTYPE   *)(buffi + i);
851                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
852                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
853 
854                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
855                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
856 
857                 dp[0    ] = FROM_S32(d0);
858                 dp[chan1] = FROM_S32(d1);
859 
860                 buffd[i    ] = 0.0;
861                 buffd[i + 1] = 0.0;
862 
863                 sp += chan2;
864                 dp += chan2;
865               }
866             }
867 
868           } else /* if (kw == 2) */ {
869 
870             p2 = buff[0];
871             k0 = pk[0]; k1 = pk[1];
872 
873             if (l < (n - 1) || off < m) {
874 #ifdef __SUNPRO_C
875 #pragma pipeloop(0)
876 #endif /* __SUNPRO_C */
877               for (i = 0; i <= (wid - 2); i += 2) {
878                 p0 = p2;
879 
880                 p1 = buff[i + 1]; p2 = buff[i + 2];
881 
882                 buffd[i    ] += p0*k0 + p1*k1;
883                 buffd[i + 1] += p1*k0 + p2*k1;
884               }
885 
886             } else {
887 #ifdef __SUNPRO_C
888 #pragma pipeloop(0)
889 #endif /* __SUNPRO_C */
890               for (i = 0; i <= (wid - 2); i += 2) {
891                 p0 = p2;
892 
893                 p1 = buff[i + 1]; p2 = buff[i + 2];
894 
895                 LOAD_BUFF(buffi);
896 
897                 dd.d64 = *(FTYPE   *)(buffi + i);
898                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
899                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
900 
901                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
902                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
903 
904                 dp[0    ] = FROM_S32(d0);
905                 dp[chan1] = FROM_S32(d1);
906 
907                 buffd[i    ] = 0.0;
908                 buffd[i + 1] = 0.0;
909 
910                 sp += chan2;
911                 dp += chan2;
912               }
913             }
914           }
915 
916           pk += kw;
917         }
918       }
919 
      /* i, sp and dp carry over from the final unrolled loop above.  When
         wid is odd one column is left; its sum is computed from scratch
         over the whole kernel (buffd is not used for it), and the matching
         sample of the next source row is copied into buffn. */
920       /* last pixels */
921       for (; i < wid; i++) {
922         FTYPE    *pk = k, s = 0;
923         mlib_s32 x, d0;
924 
925         for (l = 0; l < n; l++) {
926           FTYPE    *buff = buffc[l] + i;
927 
928           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
929         }
930 
931         d0 = D2I(s);
932         dp[0] = FROM_S32(d0);
933 
934         buffn[i + dx_l] = (FTYPE)sp[0];
935 
936         sp += chan1;
937         dp += chan1;
938       }
939 
      /* Copy the remaining source columns of the next row beyond wid. */
940       for (; i < swid; i++) {
941         buffn[i + dx_l] = (FTYPE)sp[0];
942         sp += chan1;
943       }
944 
      /* Pad the freshly loaded row left and right with its edge pixels. */
945       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
946       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
947 
948       /* next line */
949 
      /* Stop advancing the source near the bottom so the last source row
         is reused for the remaining output rows. */
950       if (j < hgt - dy_b - 2) sl += sll;
951       dl += dll;
952 
      /* Rotate the ring of row buffers by one slot. */
953       buff_ind++;
954 
955       if (buff_ind >= n + 1) buff_ind = 0;
956     }
957   }
958 
959   FREE_AND_RETURN_STATUS;
960 }
961 
962 /***************************************************************/
963 #ifndef __sparc /* for x86, using integer multiplies is faster */
964 
/*
 * Shifts x right by shift2 in place (x is modified) and stores the result in
 * res, saturated to the pixel range by CLAMP_STORE.  shift2 must be a
 * variable visible at the expansion site; CONV_FUNC_MxN_I sets it to
 * scale - shift1.  Expands to more than one statement, so it must not be
 * used as the unbraced body of an if or loop.
 */
965 #define STORE_RES(res, x)                                       \
966   x >>= shift2;                                                 \
967   CLAMP_STORE(res, x)
968 
969 mlib_status CONV_FUNC_MxN_I
970 {
971   DTYPE    *adr_src, *sl, *sp = NULL;
972   DTYPE    *adr_dst, *dl, *dp = NULL;
973   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
974   mlib_s32 *pbuff = buff;
975   mlib_s32 **buffs = buffs_arr, *buffd;
976   mlib_s32 l, off, kw, bsize, buff_ind;
977   mlib_s32 d0, d1, shift1, shift2;
978   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
979   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
980   mlib_s32 wid, hgt, sll, dll;
981   mlib_s32 nchannel, chan1;
982   mlib_s32 i, j, c, swid;
983   mlib_s32 chan2;
984   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
985   GET_SRC_DST_PARAMETERS(DTYPE);
986 
987 #if IMG_TYPE != 1
988   shift1 = 16;
989 #else
990   shift1 = 8;
991 #endif /* IMG_TYPE != 1 */
992   shift2 = scale - shift1;
993 
994   chan1 = nchannel;
995   chan2 = chan1 + chan1;
996 
997   swid = wid + (m - 1);
998 
999   bsize = (n + 2)*swid;
1000 
1001   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1002     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
1003 
1004     if (pbuff == NULL) return MLIB_FAILURE;
1005     buffs = (mlib_s32 **)(pbuff + bsize);
1006   }
1007 
1008   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
1009   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1010   buffd = buffs[n] + swid;
1011 
1012   if (m*n > MAX_N*MAX_N) {
1013     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
1014 
1015     if (k == NULL) {
1016       if (pbuff != buff) mlib_free(pbuff);
1017       return MLIB_FAILURE;
1018     }
1019   }
1020 
1021   for (i = 0; i < m*n; i++) {
1022     k[i] = kernel[i] >> shift1;
1023   }
1024 
1025   swid -= (dx_l + dx_r);
1026 
1027   for (c = 0; c < nchannel; c++) {
1028     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1029 
1030     sl = adr_src + c;
1031     dl = adr_dst + c;
1032 
1033     for (l = 0; l < n; l++) {
1034       mlib_s32  *buff = buffs[l];
1035 
1036       for (i = 0; i < dx_l; i++) {
1037         buff[i] = (mlib_s32)sl[0];
1038       }
1039 
1040 #ifdef __SUNPRO_C
1041 #pragma pipeloop(0)
1042 #endif /* __SUNPRO_C */
1043       for (i = 0; i < swid; i++) {
1044         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1045       }
1046 
1047       for (i = 0; i < dx_r; i++) {
1048         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1049       }
1050 
1051       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1052     }
1053 
1054     buff_ind = 0;
1055 
1056 #ifdef __SUNPRO_C
1057 #pragma pipeloop(0)
1058 #endif /* __SUNPRO_C */
1059     for (i = 0; i < wid; i++) buffd[i] = 0;
1060 
1061     for (j = 0; j < hgt; j++) {
1062       mlib_s32 **buffc = buffs + buff_ind;
1063       mlib_s32 *buffn = buffc[n];
1064       mlib_s32 *pk = k;
1065 
1066       for (l = 0; l < n; l++) {
1067         mlib_s32  *buff_l = buffc[l];
1068 
1069         for (off = 0; off < m;) {
1070           mlib_s32 *buff = buff_l + off;
1071 
1072           sp = sl;
1073           dp = dl;
1074 
1075           kw = m - off;
1076 
1077           if (kw > 2*MAX_KER) kw = MAX_KER; else
1078             if (kw > MAX_KER) kw = kw/2;
1079           off += kw;
1080 
1081           if (kw == 7) {
1082 
1083             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1084             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1085 
1086             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1087             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1088 
1089             if (l < (n - 1) || off < m) {
1090 #ifdef __SUNPRO_C
1091 #pragma pipeloop(0)
1092 #endif /* __SUNPRO_C */
1093               for (i = 0; i <= (wid - 2); i += 2) {
1094                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1095 
1096                 p6 = buff[i + 6]; p7 = buff[i + 7];
1097 
1098                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1099                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1100               }
1101 
1102             } else {
1103 #ifdef __SUNPRO_C
1104 #pragma pipeloop(0)
1105 #endif /* __SUNPRO_C */
1106               for (i = 0; i <= (wid - 2); i += 2) {
1107                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1108 
1109                 p6 = buff[i + 6]; p7 = buff[i + 7];
1110 
1111                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1112                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1113 
1114                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1115                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1116 
1117                 STORE_RES(dp[0    ], d0);
1118                 STORE_RES(dp[chan1], d1);
1119 
1120                 buffd[i    ] = 0;
1121                 buffd[i + 1] = 0;
1122 
1123                 sp += chan2;
1124                 dp += chan2;
1125               }
1126             }
1127 
1128           } else if (kw == 6) {
1129 
1130             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1131             p5 = buff[3]; p6 = buff[4];
1132 
1133             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1134             k4 = pk[4]; k5 = pk[5];
1135 
1136             if (l < (n - 1) || off < m) {
1137 #ifdef __SUNPRO_C
1138 #pragma pipeloop(0)
1139 #endif /* __SUNPRO_C */
1140               for (i = 0; i <= (wid - 2); i += 2) {
1141                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1142 
1143                 p5 = buff[i + 5]; p6 = buff[i + 6];
1144 
1145                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1146                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1147               }
1148 
1149             } else {
1150 #ifdef __SUNPRO_C
1151 #pragma pipeloop(0)
1152 #endif /* __SUNPRO_C */
1153               for (i = 0; i <= (wid - 2); i += 2) {
1154                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1155 
1156                 p5 = buff[i + 5]; p6 = buff[i + 6];
1157 
1158                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1159                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1160 
1161                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1162                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1163 
1164                 STORE_RES(dp[0    ], d0);
1165                 STORE_RES(dp[chan1], d1);
1166 
1167                 buffd[i    ] = 0;
1168                 buffd[i + 1] = 0;
1169 
1170                 sp += chan2;
1171                 dp += chan2;
1172               }
1173             }
1174 
1175           } else if (kw == 5) {
1176 
1177             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1178             p5 = buff[3];
1179 
1180             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1181             k4 = pk[4];
1182 
1183             if (l < (n - 1) || off < m) {
1184 #ifdef __SUNPRO_C
1185 #pragma pipeloop(0)
1186 #endif /* __SUNPRO_C */
1187               for (i = 0; i <= (wid - 2); i += 2) {
1188                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1189 
1190                 p4 = buff[i + 4]; p5 = buff[i + 5];
1191 
1192                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1193                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1194               }
1195 
1196             } else {
1197 #ifdef __SUNPRO_C
1198 #pragma pipeloop(0)
1199 #endif /* __SUNPRO_C */
1200               for (i = 0; i <= (wid - 2); i += 2) {
1201                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1202 
1203                 p4 = buff[i + 4]; p5 = buff[i + 5];
1204 
1205                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1206                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1207 
1208                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1209                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1210 
1211                 STORE_RES(dp[0    ], d0);
1212                 STORE_RES(dp[chan1], d1);
1213 
1214                 buffd[i    ] = 0;
1215                 buffd[i + 1] = 0;
1216 
1217                 sp += chan2;
1218                 dp += chan2;
1219               }
1220             }
1221 
1222           } else if (kw == 4) {
1223 
1224             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1225 
1226             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1227 
1228             if (l < (n - 1) || off < m) {
1229 #ifdef __SUNPRO_C
1230 #pragma pipeloop(0)
1231 #endif /* __SUNPRO_C */
1232               for (i = 0; i <= (wid - 2); i += 2) {
1233                 p0 = p2; p1 = p3; p2 = p4;
1234 
1235                 p3 = buff[i + 3]; p4 = buff[i + 4];
1236 
1237                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1238                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1239               }
1240 
1241             } else {
1242 #ifdef __SUNPRO_C
1243 #pragma pipeloop(0)
1244 #endif /* __SUNPRO_C */
1245               for (i = 0; i <= (wid - 2); i += 2) {
1246                 p0 = p2; p1 = p3; p2 = p4;
1247 
1248                 p3 = buff[i + 3]; p4 = buff[i + 4];
1249 
1250                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1251                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1252 
1253                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1254                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1255 
1256                 STORE_RES(dp[0    ], d0);
1257                 STORE_RES(dp[chan1], d1);
1258 
1259                 buffd[i    ] = 0;
1260                 buffd[i + 1] = 0;
1261 
1262                 sp += chan2;
1263                 dp += chan2;
1264               }
1265             }
1266 
1267           } else if (kw == 3) {
1268 
1269             p2 = buff[0]; p3 = buff[1];
1270             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1271 
1272             if (l < (n - 1) || off < m) {
1273 #ifdef __SUNPRO_C
1274 #pragma pipeloop(0)
1275 #endif /* __SUNPRO_C */
1276               for (i = 0; i <= (wid - 2); i += 2) {
1277                 p0 = p2; p1 = p3;
1278 
1279                 p2 = buff[i + 2]; p3 = buff[i + 3];
1280 
1281                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1282                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1283               }
1284 
1285             } else {
1286 #ifdef __SUNPRO_C
1287 #pragma pipeloop(0)
1288 #endif /* __SUNPRO_C */
1289               for (i = 0; i <= (wid - 2); i += 2) {
1290                 p0 = p2; p1 = p3;
1291 
1292                 p2 = buff[i + 2]; p3 = buff[i + 3];
1293 
1294                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1295                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1296 
1297                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1298                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1299 
1300                 STORE_RES(dp[0    ], d0);
1301                 STORE_RES(dp[chan1], d1);
1302 
1303                 buffd[i    ] = 0;
1304                 buffd[i + 1] = 0;
1305 
1306                 sp += chan2;
1307                 dp += chan2;
1308               }
1309             }
1310 
1311           } else if (kw == 2) {
1312 
1313             p2 = buff[0];
1314             k0 = pk[0]; k1 = pk[1];
1315 
1316             if (l < (n - 1) || off < m) {
1317 #ifdef __SUNPRO_C
1318 #pragma pipeloop(0)
1319 #endif /* __SUNPRO_C */
1320               for (i = 0; i <= (wid - 2); i += 2) {
1321                 p0 = p2;
1322 
1323                 p1 = buff[i + 1]; p2 = buff[i + 2];
1324 
1325                 buffd[i    ] += p0*k0 + p1*k1;
1326                 buffd[i + 1] += p1*k0 + p2*k1;
1327               }
1328 
1329             } else {
1330 #ifdef __SUNPRO_C
1331 #pragma pipeloop(0)
1332 #endif /* __SUNPRO_C */
1333               for (i = 0; i <= (wid - 2); i += 2) {
1334                 p0 = p2;
1335 
1336                 p1 = buff[i + 1]; p2 = buff[i + 2];
1337 
1338                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1339                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1340 
1341                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1342                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1343 
1344                 STORE_RES(dp[0    ], d0);
1345                 STORE_RES(dp[chan1], d1);
1346 
1347                 buffd[i    ] = 0;
1348                 buffd[i + 1] = 0;
1349 
1350                 sp += chan2;
1351                 dp += chan2;
1352               }
1353             }
1354 
1355           } else /* kw == 1 */{
1356 
1357             k0 = pk[0];
1358 
1359             if (l < (n - 1) || off < m) {
1360 #ifdef __SUNPRO_C
1361 #pragma pipeloop(0)
1362 #endif /* __SUNPRO_C */
1363               for (i = 0; i <= (wid - 2); i += 2) {
1364                 p0 = buff[i]; p1 = buff[i + 1];
1365 
1366                 buffd[i    ] += p0*k0;
1367                 buffd[i + 1] += p1*k0;
1368               }
1369 
1370             } else {
1371 #ifdef __SUNPRO_C
1372 #pragma pipeloop(0)
1373 #endif /* __SUNPRO_C */
1374               for (i = 0; i <= (wid - 2); i += 2) {
1375                 p0 = buff[i]; p1 = buff[i + 1];
1376 
1377                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1378                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1379 
1380                 d0 = (p0*k0 + buffd[i    ]);
1381                 d1 = (p1*k0 + buffd[i + 1]);
1382 
1383                 STORE_RES(dp[0    ], d0);
1384                 STORE_RES(dp[chan1], d1);
1385 
1386                 buffd[i    ] = 0;
1387                 buffd[i + 1] = 0;
1388 
1389                 sp += chan2;
1390                 dp += chan2;
1391               }
1392             }
1393           }
1394 
1395           pk += kw;
1396         }
1397       }
1398 
1399       /* last pixels */
1400       for (; i < wid; i++) {
1401         mlib_s32 *pk = k, x, s = 0;
1402 
1403         for (l = 0; l < n; l++) {
1404           mlib_s32 *buff = buffc[l] + i;
1405 
1406           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1407         }
1408 
1409         STORE_RES(dp[0], s);
1410 
1411         buffn[i + dx_l] = (mlib_s32)sp[0];
1412 
1413         sp += chan1;
1414         dp += chan1;
1415       }
1416 
1417       for (; i < swid; i++) {
1418         buffn[i + dx_l] = (mlib_s32)sp[0];
1419         sp += chan1;
1420       }
1421 
1422       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1423       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1424 
1425       /* next line */
1426 
1427       if (j < hgt - dy_b - 2) sl += sll;
1428       dl += dll;
1429 
1430       buff_ind++;
1431 
1432       if (buff_ind >= n + 1) buff_ind = 0;
1433     }
1434   }
1435 
1436   if (pbuff != buff) mlib_free(pbuff);
1437   if (k != k_locl) mlib_free(k);
1438 
1439   return MLIB_SUCCESS;
1440 }
1441 
1442 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1443 
1444 /***************************************************************/
1445