1 /*
2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 /*
28  * FUNCTION
29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
30  *   MLIB_EDGE_SRC_EXTEND mask
31  */
32 
33 #include "mlib_image.h"
34 #include "mlib_ImageConv.h"
35 #include "mlib_c_ImageConv.h"
36 
37 /*
38  * This define switches between functions of different data types
39  */
40 
41 #define IMG_TYPE 1
42 
43 /***************************************************************/
44 #if IMG_TYPE == 1
45 
46 #define DTYPE             mlib_u8
47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
51 #define DSCALE            (1 << 24)
52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
53 #define S64TOS32(x)       (x)
54 #define SAT_OFF           -(1u << 31)
55 
56 #elif IMG_TYPE == 2
57 
58 #define DTYPE             mlib_s16
59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
63 #define DSCALE            65536.0
64 #define FROM_S32(x)       ((x) >> 16)
65 #define S64TOS32(x)       ((x) & 0xffffffff)
66 #define SAT_OFF
67 
68 #elif IMG_TYPE == 3
69 
70 #define DTYPE             mlib_u16
71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
75 #define DSCALE            65536.0
76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
77 #define S64TOS32(x)       (x)
78 #define SAT_OFF           -(1u << 31)
79 
80 #endif /* IMG_TYPE == 1 */
81 
82 /***************************************************************/
83 #define PARAM                                                   \
84   mlib_image       *dst,                                        \
85   const mlib_image *src,                                        \
86   mlib_s32         dx_l,                                        \
87   mlib_s32         dx_r,                                        \
88   mlib_s32         dy_t,                                        \
89   mlib_s32         dy_b,                                        \
90   const mlib_s32   *kern,                                       \
91   mlib_s32         scalef_expon,                                \
92   mlib_s32         cmask
93 
94 /***************************************************************/
95 #define PARAM_MxN                                               \
96   mlib_image       *dst,                                        \
97   const mlib_image *src,                                        \
98   const mlib_s32   *kernel,                                     \
99   mlib_s32         m,                                           \
100   mlib_s32         n,                                           \
101   mlib_s32         dx_l,                                        \
102   mlib_s32         dx_r,                                        \
103   mlib_s32         dy_t,                                        \
104   mlib_s32         dy_b,                                        \
105   mlib_s32         scale,                                       \
106   mlib_s32         cmask
107 
108 /***************************************************************/
109 #define FTYPE mlib_d64
110 
111 #ifndef MLIB_USE_FTOI_CLAMPING
112 
113 #define CLAMP_S32(x)                                            \
114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
115 
116 #else
117 
118 #define CLAMP_S32(x) ((mlib_s32)(x))
119 
120 #endif /* MLIB_USE_FTOI_CLAMPING */
121 
122 /***************************************************************/
123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
124 
125 /***************************************************************/
126 #ifdef _NO_LONGLONG
127 
128 #define LOAD_BUFF(buff)                                         \
129   buff[i    ] = sp[0];                                          \
130   buff[i + 1] = sp[chan1]
131 
132 #else /* _NO_LONGLONG */
133 
134 #ifdef _LITTLE_ENDIAN
135 
136 #define LOAD_BUFF(buff)                                         \
137   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
138 
139 #else /* _LITTLE_ENDIAN */
140 
141 #define LOAD_BUFF(buff)                                         \
142   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
143 
144 #endif /* _LITTLE_ENDIAN */
145 #endif /* _NO_LONGLONG */
146 
147 /***************************************************************/
148 typedef union {
149   mlib_d64 d64;
150   struct {
151     mlib_s32 i0;
152     mlib_s32 i1;
153   } i32s;
154 } d64_2x32;
155 
156 /***************************************************************/
157 #define GET_SRC_DST_PARAMETERS(type)                            \
158   hgt = mlib_ImageGetHeight(src);                               \
159   wid = mlib_ImageGetWidth(src);                                \
160   nchannel = mlib_ImageGetChannels(src);                        \
161   sll = mlib_ImageGetStride(src) / sizeof(type);                \
162   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
163   adr_src = (type *)mlib_ImageGetData(src);                     \
164   adr_dst = (type *)mlib_ImageGetData(dst)
165 
166 /***************************************************************/
167 #ifndef __sparc
168 #if IMG_TYPE == 1
169 
170 /*
171  * Test for the presence of any "1" bit in bits
172    8 to 31 of val. If present, then val is either
173    negative or >255. If over/underflows of 8 bits
174    are uncommon, then this technique can be a win,
175    since only a single test, rather than two, is
176    necessary to determine if clamping is needed.
177    On the other hand, if over/underflows are common,
178    it adds an extra test.
179 */
180 #define CLAMP_STORE(dst, val)                                   \
181   if (val & 0xffffff00) {                                       \
182     if (val < MLIB_U8_MIN)                                      \
183       dst = MLIB_U8_MIN;                                        \
184     else                                                        \
185       dst = MLIB_U8_MAX;                                        \
186   } else {                                                      \
187     dst = (mlib_u8)val;                                         \
188   }
189 
190 #elif IMG_TYPE == 2
191 
192 #define CLAMP_STORE(dst, val)                                   \
193   if (val >= MLIB_S16_MAX)                                      \
194     dst = MLIB_S16_MAX;                                         \
195   else if (val <= MLIB_S16_MIN)                                 \
196     dst = MLIB_S16_MIN;                                         \
197   else                                                          \
198     dst = (mlib_s16)val
199 
200 #elif IMG_TYPE == 3
201 
202 #define CLAMP_STORE(dst, val)                                   \
203   if (val >= MLIB_U16_MAX)                                      \
204     dst = MLIB_U16_MAX;                                         \
205   else if (val <= MLIB_U16_MIN)                                 \
206     dst = MLIB_U16_MIN;                                         \
207   else                                                          \
208     dst = (mlib_u16)val
209 
210 #endif /* IMG_TYPE == 1 */
211 #endif /* __sparc */
212 
213 /***************************************************************/
214 #define MAX_KER   7
215 #define MAX_N    15
216 #define BUFF_SIZE   1600
217 #define CACHE_SIZE  (64*1024)
218 
mlib_ImageConv1xN_ext(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dy_t,mlib_s32 dy_b,mlib_s32 cmask)219 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
220                                          const mlib_image *src,
221                                          const mlib_d64   *k,
222                                          mlib_s32         n,
223                                          mlib_s32         dy_t,
224                                          mlib_s32         dy_b,
225                                          mlib_s32         cmask)
226 {
227   DTYPE    *adr_src, *sl;
228   DTYPE    *adr_dst, *dl, *dp;
229   FTYPE    buff[BUFF_SIZE];
230   FTYPE    *buffd;
231   FTYPE    *pbuff = buff;
232   const FTYPE    *pk;
233   FTYPE    k0, k1, k2, k3;
234   FTYPE    p0, p1, p2, p3, p4;
235   FTYPE    *sbuff;
236   mlib_s32 l, k_off, off, bsize;
237   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
238   mlib_s32 d0, d1, ii;
239   mlib_s32 wid, hgt, sll, dll;
240   mlib_s32 nchannel;
241   mlib_s32 i, j, c;
242   GET_SRC_DST_PARAMETERS(DTYPE);
243 
244   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
245 
246   if (max_hsize < 1) max_hsize = 1;
247   if (max_hsize > hgt) max_hsize = hgt;
248 
249   shgt = hgt + (n - 1);
250   smax_hsize = max_hsize + (n - 1);
251 
252   bsize = 2 * (smax_hsize + 1);
253 
254   if (bsize > BUFF_SIZE) {
255     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
256 
257     if (pbuff == NULL) return MLIB_FAILURE;
258   }
259 
260   sbuff = pbuff;
261   buffd = sbuff + smax_hsize;
262 
263   shgt -= (dy_t + dy_b);
264   k_off = 0;
265 
266   for (l = 0; l < hgt; l += hsize) {
267     hsize = hgt - l;
268 
269     if (hsize > max_hsize) hsize = max_hsize;
270 
271     smax_hsize = hsize + (n - 1);
272 
273     for (c = 0; c < nchannel; c++) {
274       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
275 
276       sl = adr_src + c;
277       dl = adr_dst + c;
278 
279 #ifdef __SUNPRO_C
280 #pragma pipeloop(0)
281 #endif /* __SUNPRO_C */
282       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
283 
284       for (j = 0; j < wid; j++) {
285         FTYPE    *buff = sbuff;
286 
287         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
288           sbuff[i - k_off] = (FTYPE)sl[0];
289         }
290 
291 #ifdef __SUNPRO_C
292 #pragma pipeloop(0)
293 #endif /* __SUNPRO_C */
294         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
295           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
296         }
297 
298         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
299           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
300         }
301 
302         pk = k;
303 
304         for (off = 0; off < (n - 4); off += 4) {
305 
306           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
307           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
308 
309 #ifdef __SUNPRO_C
310 #pragma pipeloop(0)
311 #endif /* __SUNPRO_C */
312           for (i = 0; i < hsize; i += 2) {
313             p0 = p2; p1 = p3; p2 = p4;
314 
315             p3 = buff[i + 3]; p4 = buff[i + 4];
316 
317             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
318             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
319           }
320 
321           pk += 4;
322           buff += 4;
323         }
324 
325         dp = dl;
326         kh = n - off;
327 
328         if (kh == 4) {
329           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
330           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
331 
332 #ifdef __SUNPRO_C
333 #pragma pipeloop(0)
334 #endif /* __SUNPRO_C */
335           for (i = 0; i <= (hsize - 2); i += 2) {
336             p0 = p2; p1 = p3; p2 = p4;
337 
338             p3 = buff[i + 3]; p4 = buff[i + 4];
339 
340             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
341             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
342 
343             dp[0  ] = FROM_S32(d0);
344             dp[dll] = FROM_S32(d1);
345 
346             buffd[i    ] = 0.0;
347             buffd[i + 1] = 0.0;
348 
349             dp += 2*dll;
350           }
351 
352           if (i < hsize) {
353             p0 = p2; p1 = p3; p2 = p4;
354             p3 = buff[i + 3];
355             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
356             dp[0] = FROM_S32(d0);
357             buffd[i] = 0.0;
358           }
359 
360         } else if (kh == 3) {
361 
362           p2 = buff[0]; p3 = buff[1];
363           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
364 
365 #ifdef __SUNPRO_C
366 #pragma pipeloop(0)
367 #endif /* __SUNPRO_C */
368           for (i = 0; i <= (hsize - 2); i += 2) {
369             p0 = p2; p1 = p3;
370 
371             p2 = buff[i + 2]; p3 = buff[i + 3];
372 
373             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
374             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
375 
376             dp[0  ] = FROM_S32(d0);
377             dp[dll] = FROM_S32(d1);
378 
379             buffd[i    ] = 0.0;
380             buffd[i + 1] = 0.0;
381 
382             dp += 2*dll;
383           }
384 
385           if (i < hsize) {
386             p0 = p2; p1 = p3;
387             p2 = buff[i + 2];
388             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
389             dp[0] = FROM_S32(d0);
390 
391             buffd[i] = 0.0;
392           }
393 
394         } else if (kh == 2) {
395 
396           p2 = buff[0];
397           k0 = pk[0]; k1 = pk[1];
398 
399 #ifdef __SUNPRO_C
400 #pragma pipeloop(0)
401 #endif /* __SUNPRO_C */
402           for (i = 0; i <= (hsize - 2); i += 2) {
403             p0 = p2;
404 
405             p1 = buff[i + 1]; p2 = buff[i + 2];
406 
407             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
408             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
409 
410             dp[0  ] = FROM_S32(d0);
411             dp[dll] = FROM_S32(d1);
412 
413             buffd[i    ] = 0.0;
414             buffd[i + 1] = 0.0;
415 
416             dp += 2*dll;
417           }
418 
419           if (i < hsize) {
420             p0 = p2;
421             p1 = buff[i + 1];
422             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
423             dp[0] = FROM_S32(d0);
424 
425             buffd[i] = 0.0;
426           }
427 
428         } else /* kh == 1 */{
429 
430           k0 = pk[0];
431 
432 #ifdef __SUNPRO_C
433 #pragma pipeloop(0)
434 #endif /* __SUNPRO_C */
435           for (i = 0; i <= (hsize - 2); i += 2) {
436             p0 = buff[i]; p1 = buff[i + 1];
437 
438             d0 = D2I(p0*k0 + buffd[i    ]);
439             d1 = D2I(p1*k0 + buffd[i + 1]);
440 
441             dp[0  ] = FROM_S32(d0);
442             dp[dll] = FROM_S32(d1);
443 
444             buffd[i    ] = 0.0;
445             buffd[i + 1] = 0.0;
446 
447             dp += 2*dll;
448           }
449 
450           if (i < hsize) {
451             p0 = buff[i];
452             d0 = D2I(p0*k0 + buffd[i]);
453             dp[0] = FROM_S32(d0);
454 
455             buffd[i] = 0.0;
456           }
457         }
458 
459         /* next line */
460         sl += nchannel;
461         dl += nchannel;
462       }
463     }
464 
465     k_off += max_hsize;
466     adr_dst += max_hsize*dll;
467   }
468 
469   if (pbuff != buff) mlib_free(pbuff);
470 
471   return MLIB_SUCCESS;
472 }
473 
474 /***************************************************************/
475 mlib_status CONV_FUNC_MxN
476 {
477   DTYPE    *adr_src, *sl, *sp = NULL;
478   DTYPE    *adr_dst, *dl, *dp = NULL;
479   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
480   FTYPE    **buffs = buffs_arr, *buffd;
481   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
482   FTYPE    *pbuff = buff;
483   FTYPE    k0, k1, k2, k3, k4, k5, k6;
484   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
485   mlib_s32 *buffi;
486   mlib_s32 mn, l, off, kw, bsize, buff_ind;
487   mlib_s32 d0, d1;
488   mlib_s32 wid, hgt, sll, dll;
489   mlib_s32 nchannel, chan1, chan2;
490   mlib_s32 i, j, c, swid;
491   d64_2x32 dd;
492   mlib_status status = MLIB_SUCCESS;
493 
494   GET_SRC_DST_PARAMETERS(DTYPE);
495 
496   if (scale > 30) {
497     fscale *= 1.0/(1 << 30);
498     scale -= 30;
499   }
500 
501   fscale /= (1 << scale);
502 
503   mn = m*n;
504 
505   if (mn > 256) {
506     k = mlib_malloc(mn*sizeof(mlib_d64));
507 
508     if (k == NULL) return MLIB_FAILURE;
509   }
510 
511   for (i = 0; i < mn; i++) {
512     k[i] = kernel[i]*fscale;
513   }
514 
515   if (m == 1) {
516     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
517     FREE_AND_RETURN_STATUS;
518   }
519 
520   swid = wid + (m - 1);
521 
522   bsize = (n + 3)*swid;
523 
524   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
525     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
526 
527     if (pbuff == NULL) {
528       status = MLIB_FAILURE;
529       FREE_AND_RETURN_STATUS;
530     }
531     buffs = (FTYPE   **)(pbuff + bsize);
532   }
533 
534   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
535   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
536   buffd = buffs[n] + swid;
537   buffi = (mlib_s32*)(buffd + swid);
538 
539   chan1 = nchannel;
540   chan2 = chan1 + chan1;
541 
542   swid -= (dx_l + dx_r);
543 
544   for (c = 0; c < nchannel; c++) {
545     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
546 
547     sl = adr_src + c;
548     dl = adr_dst + c;
549 
550     for (l = 0; l < n; l++) {
551       FTYPE    *buff = buffs[l];
552 
553       for (i = 0; i < dx_l; i++) {
554         buff[i] = (FTYPE)sl[0];
555       }
556 
557 #ifdef __SUNPRO_C
558 #pragma pipeloop(0)
559 #endif /* __SUNPRO_C */
560       for (i = 0; i < swid; i++) {
561         buff[i + dx_l] = (FTYPE)sl[i*chan1];
562       }
563 
564       for (i = 0; i < dx_r; i++) {
565         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
566       }
567 
568       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
569     }
570 
571     buff_ind = 0;
572 
573 #ifdef __SUNPRO_C
574 #pragma pipeloop(0)
575 #endif /* __SUNPRO_C */
576     for (i = 0; i < wid; i++) buffd[i] = 0.0;
577 
578     for (j = 0; j < hgt; j++) {
579       FTYPE    **buffc = buffs + buff_ind;
580       FTYPE    *buffn = buffc[n];
581       FTYPE    *pk = k;
582 
583       for (l = 0; l < n; l++) {
584         FTYPE    *buff_l = buffc[l];
585 
586         for (off = 0; off < m;) {
587           FTYPE    *buff = buff_l + off;
588 
589           kw = m - off;
590 
591           if (kw > 2*MAX_KER) kw = MAX_KER; else
592             if (kw > MAX_KER) kw = kw/2;
593           off += kw;
594 
595           sp = sl;
596           dp = dl;
597 
598           if (kw == 7) {
599 
600             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
601             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
602 
603             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
604             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
605 
606             if (l < (n - 1) || off < m) {
607 #ifdef __SUNPRO_C
608 #pragma pipeloop(0)
609 #endif /* __SUNPRO_C */
610               for (i = 0; i <= (wid - 2); i += 2) {
611                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
612 
613                 p6 = buff[i + 6]; p7 = buff[i + 7];
614 
615                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
616                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
617               }
618 
619             } else {
620 #ifdef __SUNPRO_C
621 #pragma pipeloop(0)
622 #endif /* __SUNPRO_C */
623               for (i = 0; i <= (wid - 2); i += 2) {
624                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
625 
626                 p6 = buff[i + 6]; p7 = buff[i + 7];
627 
628                 LOAD_BUFF(buffi);
629 
630                 dd.d64 = *(FTYPE   *)(buffi + i);
631                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
632                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
633 
634                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
635                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
636 
637                 dp[0    ] = FROM_S32(d0);
638                 dp[chan1] = FROM_S32(d1);
639 
640                 buffd[i    ] = 0.0;
641                 buffd[i + 1] = 0.0;
642 
643                 sp += chan2;
644                 dp += chan2;
645               }
646             }
647 
648           } else if (kw == 6) {
649 
650             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
651             p5 = buff[3]; p6 = buff[4];
652 
653             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
654             k4 = pk[4]; k5 = pk[5];
655 
656             if (l < (n - 1) || off < m) {
657 #ifdef __SUNPRO_C
658 #pragma pipeloop(0)
659 #endif /* __SUNPRO_C */
660               for (i = 0; i <= (wid - 2); i += 2) {
661                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
662 
663                 p5 = buff[i + 5]; p6 = buff[i + 6];
664 
665                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
666                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
667               }
668 
669             } else {
670 #ifdef __SUNPRO_C
671 #pragma pipeloop(0)
672 #endif /* __SUNPRO_C */
673               for (i = 0; i <= (wid - 2); i += 2) {
674                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
675 
676                 p5 = buff[i + 5]; p6 = buff[i + 6];
677 
678                 LOAD_BUFF(buffi);
679 
680                 dd.d64 = *(FTYPE   *)(buffi + i);
681                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
682                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
683 
684                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
685                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
686 
687                 dp[0    ] = FROM_S32(d0);
688                 dp[chan1] = FROM_S32(d1);
689 
690                 buffd[i    ] = 0.0;
691                 buffd[i + 1] = 0.0;
692 
693                 sp += chan2;
694                 dp += chan2;
695               }
696             }
697 
698           } else if (kw == 5) {
699 
700             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
701             p5 = buff[3];
702 
703             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
704             k4 = pk[4];
705 
706             if (l < (n - 1) || off < m) {
707 #ifdef __SUNPRO_C
708 #pragma pipeloop(0)
709 #endif /* __SUNPRO_C */
710               for (i = 0; i <= (wid - 2); i += 2) {
711                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
712 
713                 p4 = buff[i + 4]; p5 = buff[i + 5];
714 
715                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
716                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
717               }
718 
719             } else {
720 #ifdef __SUNPRO_C
721 #pragma pipeloop(0)
722 #endif /* __SUNPRO_C */
723               for (i = 0; i <= (wid - 2); i += 2) {
724                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
725 
726                 p4 = buff[i + 4]; p5 = buff[i + 5];
727 
728                 LOAD_BUFF(buffi);
729 
730                 dd.d64 = *(FTYPE   *)(buffi + i);
731                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
732                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
733 
734                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
735                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
736 
737                 dp[0    ] = FROM_S32(d0);
738                 dp[chan1] = FROM_S32(d1);
739 
740                 buffd[i    ] = 0.0;
741                 buffd[i + 1] = 0.0;
742 
743                 sp += chan2;
744                 dp += chan2;
745               }
746             }
747 
748           } else if (kw == 4) {
749 
750             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
751 
752             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
753 
754             if (l < (n - 1) || off < m) {
755 #ifdef __SUNPRO_C
756 #pragma pipeloop(0)
757 #endif /* __SUNPRO_C */
758               for (i = 0; i <= (wid - 2); i += 2) {
759                 p0 = p2; p1 = p3; p2 = p4;
760 
761                 p3 = buff[i + 3]; p4 = buff[i + 4];
762 
763                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
764                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
765               }
766 
767             } else {
768 #ifdef __SUNPRO_C
769 #pragma pipeloop(0)
770 #endif /* __SUNPRO_C */
771               for (i = 0; i <= (wid - 2); i += 2) {
772                 p0 = p2; p1 = p3; p2 = p4;
773 
774                 p3 = buff[i + 3]; p4 = buff[i + 4];
775 
776                 LOAD_BUFF(buffi);
777 
778                 dd.d64 = *(FTYPE   *)(buffi + i);
779                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
780                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
781 
782                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
783                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
784 
785                 dp[0    ] = FROM_S32(d0);
786                 dp[chan1] = FROM_S32(d1);
787 
788                 buffd[i    ] = 0.0;
789                 buffd[i + 1] = 0.0;
790 
791                 sp += chan2;
792                 dp += chan2;
793               }
794             }
795 
796           } else if (kw == 3) {
797 
798             p2 = buff[0]; p3 = buff[1];
799             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
800 
801             if (l < (n - 1) || off < m) {
802 #ifdef __SUNPRO_C
803 #pragma pipeloop(0)
804 #endif /* __SUNPRO_C */
805               for (i = 0; i <= (wid - 2); i += 2) {
806                 p0 = p2; p1 = p3;
807 
808                 p2 = buff[i + 2]; p3 = buff[i + 3];
809 
810                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
811                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
812               }
813 
814             } else {
815 #ifdef __SUNPRO_C
816 #pragma pipeloop(0)
817 #endif /* __SUNPRO_C */
818               for (i = 0; i <= (wid - 2); i += 2) {
819                 p0 = p2; p1 = p3;
820 
821                 p2 = buff[i + 2]; p3 = buff[i + 3];
822 
823                 LOAD_BUFF(buffi);
824 
825                 dd.d64 = *(FTYPE   *)(buffi + i);
826                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
827                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
828 
829                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
830                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
831 
832                 dp[0    ] = FROM_S32(d0);
833                 dp[chan1] = FROM_S32(d1);
834 
835                 buffd[i    ] = 0.0;
836                 buffd[i + 1] = 0.0;
837 
838                 sp += chan2;
839                 dp += chan2;
840               }
841             }
842 
843           } else /* if (kw == 2) */ {
844 
845             p2 = buff[0];
846             k0 = pk[0]; k1 = pk[1];
847 
848             if (l < (n - 1) || off < m) {
849 #ifdef __SUNPRO_C
850 #pragma pipeloop(0)
851 #endif /* __SUNPRO_C */
852               for (i = 0; i <= (wid - 2); i += 2) {
853                 p0 = p2;
854 
855                 p1 = buff[i + 1]; p2 = buff[i + 2];
856 
857                 buffd[i    ] += p0*k0 + p1*k1;
858                 buffd[i + 1] += p1*k0 + p2*k1;
859               }
860 
861             } else {
862 #ifdef __SUNPRO_C
863 #pragma pipeloop(0)
864 #endif /* __SUNPRO_C */
865               for (i = 0; i <= (wid - 2); i += 2) {
866                 p0 = p2;
867 
868                 p1 = buff[i + 1]; p2 = buff[i + 2];
869 
870                 LOAD_BUFF(buffi);
871 
872                 dd.d64 = *(FTYPE   *)(buffi + i);
873                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
874                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
875 
876                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
877                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
878 
879                 dp[0    ] = FROM_S32(d0);
880                 dp[chan1] = FROM_S32(d1);
881 
882                 buffd[i    ] = 0.0;
883                 buffd[i + 1] = 0.0;
884 
885                 sp += chan2;
886                 dp += chan2;
887               }
888             }
889           }
890 
891           pk += kw;
892         }
893       }
894 
895       /* last pixels */
896       for (; i < wid; i++) {
897         FTYPE    *pk = k, s = 0;
898         mlib_s32 x, d0;
899 
900         for (l = 0; l < n; l++) {
901           FTYPE    *buff = buffc[l] + i;
902 
903           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
904         }
905 
906         d0 = D2I(s);
907         dp[0] = FROM_S32(d0);
908 
909         buffn[i + dx_l] = (FTYPE)sp[0];
910 
911         sp += chan1;
912         dp += chan1;
913       }
914 
915       for (; i < swid; i++) {
916         buffn[i + dx_l] = (FTYPE)sp[0];
917         sp += chan1;
918       }
919 
920       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
921       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
922 
923       /* next line */
924 
925       if (j < hgt - dy_b - 2) sl += sll;
926       dl += dll;
927 
928       buff_ind++;
929 
930       if (buff_ind >= n + 1) buff_ind = 0;
931     }
932   }
933 
934   FREE_AND_RETURN_STATUS;
935 }
936 
937 /***************************************************************/
938 #ifndef __sparc /* for x86, using integer multiplies is faster */
939 
/*
 * Shift x right by shift2 (a variable that must be in scope where the
 * macro is expanded), then store it into res through CLAMP_STORE.
 * x is modified in place.  Wrapped in do { } while (0) so that the two
 * steps stay together when the macro is used as the body of an unbraced
 * if/else or loop; every use must be terminated with a semicolon.
 */
#define STORE_RES(res, x)                                       \
  do {                                                          \
    (x) >>= shift2;                                             \
    CLAMP_STORE(res, x);                                        \
  } while (0)
943 
944 mlib_status CONV_FUNC_MxN_I
945 {
946   DTYPE    *adr_src, *sl, *sp = NULL;
947   DTYPE    *adr_dst, *dl, *dp = NULL;
948   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
949   mlib_s32 *pbuff = buff;
950   mlib_s32 **buffs = buffs_arr, *buffd;
951   mlib_s32 l, off, kw, bsize, buff_ind;
952   mlib_s32 d0, d1, shift1, shift2;
953   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
954   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
955   mlib_s32 wid, hgt, sll, dll;
956   mlib_s32 nchannel, chan1;
957   mlib_s32 i, j, c, swid;
958   mlib_s32 chan2;
959   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
960   GET_SRC_DST_PARAMETERS(DTYPE);
961 
962 #if IMG_TYPE != 1
963   shift1 = 16;
964 #else
965   shift1 = 8;
966 #endif /* IMG_TYPE != 1 */
967   shift2 = scale - shift1;
968 
969   chan1 = nchannel;
970   chan2 = chan1 + chan1;
971 
972   swid = wid + (m - 1);
973 
974   bsize = (n + 2)*swid;
975 
976   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
977     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
978 
979     if (pbuff == NULL) return MLIB_FAILURE;
980     buffs = (mlib_s32 **)(pbuff + bsize);
981   }
982 
983   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
984   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
985   buffd = buffs[n] + swid;
986 
987   if (m*n > MAX_N*MAX_N) {
988     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
989 
990     if (k == NULL) {
991       if (pbuff != buff) mlib_free(pbuff);
992       return MLIB_FAILURE;
993     }
994   }
995 
996   for (i = 0; i < m*n; i++) {
997     k[i] = kernel[i] >> shift1;
998   }
999 
1000   swid -= (dx_l + dx_r);
1001 
1002   for (c = 0; c < nchannel; c++) {
1003     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1004 
1005     sl = adr_src + c;
1006     dl = adr_dst + c;
1007 
1008     for (l = 0; l < n; l++) {
1009       mlib_s32  *buff = buffs[l];
1010 
1011       for (i = 0; i < dx_l; i++) {
1012         buff[i] = (mlib_s32)sl[0];
1013       }
1014 
1015 #ifdef __SUNPRO_C
1016 #pragma pipeloop(0)
1017 #endif /* __SUNPRO_C */
1018       for (i = 0; i < swid; i++) {
1019         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1020       }
1021 
1022       for (i = 0; i < dx_r; i++) {
1023         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1024       }
1025 
1026       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1027     }
1028 
1029     buff_ind = 0;
1030 
1031 #ifdef __SUNPRO_C
1032 #pragma pipeloop(0)
1033 #endif /* __SUNPRO_C */
1034     for (i = 0; i < wid; i++) buffd[i] = 0;
1035 
1036     for (j = 0; j < hgt; j++) {
1037       mlib_s32 **buffc = buffs + buff_ind;
1038       mlib_s32 *buffn = buffc[n];
1039       mlib_s32 *pk = k;
1040 
1041       for (l = 0; l < n; l++) {
1042         mlib_s32  *buff_l = buffc[l];
1043 
1044         for (off = 0; off < m;) {
1045           mlib_s32 *buff = buff_l + off;
1046 
1047           sp = sl;
1048           dp = dl;
1049 
1050           kw = m - off;
1051 
1052           if (kw > 2*MAX_KER) kw = MAX_KER; else
1053             if (kw > MAX_KER) kw = kw/2;
1054           off += kw;
1055 
1056           if (kw == 7) {
1057 
1058             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1059             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1060 
1061             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1062             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1063 
1064             if (l < (n - 1) || off < m) {
1065 #ifdef __SUNPRO_C
1066 #pragma pipeloop(0)
1067 #endif /* __SUNPRO_C */
1068               for (i = 0; i <= (wid - 2); i += 2) {
1069                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1070 
1071                 p6 = buff[i + 6]; p7 = buff[i + 7];
1072 
1073                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1074                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1075               }
1076 
1077             } else {
1078 #ifdef __SUNPRO_C
1079 #pragma pipeloop(0)
1080 #endif /* __SUNPRO_C */
1081               for (i = 0; i <= (wid - 2); i += 2) {
1082                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1083 
1084                 p6 = buff[i + 6]; p7 = buff[i + 7];
1085 
1086                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1087                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1088 
1089                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1090                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1091 
1092                 STORE_RES(dp[0    ], d0);
1093                 STORE_RES(dp[chan1], d1);
1094 
1095                 buffd[i    ] = 0;
1096                 buffd[i + 1] = 0;
1097 
1098                 sp += chan2;
1099                 dp += chan2;
1100               }
1101             }
1102 
1103           } else if (kw == 6) {
1104 
1105             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1106             p5 = buff[3]; p6 = buff[4];
1107 
1108             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1109             k4 = pk[4]; k5 = pk[5];
1110 
1111             if (l < (n - 1) || off < m) {
1112 #ifdef __SUNPRO_C
1113 #pragma pipeloop(0)
1114 #endif /* __SUNPRO_C */
1115               for (i = 0; i <= (wid - 2); i += 2) {
1116                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1117 
1118                 p5 = buff[i + 5]; p6 = buff[i + 6];
1119 
1120                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1121                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1122               }
1123 
1124             } else {
1125 #ifdef __SUNPRO_C
1126 #pragma pipeloop(0)
1127 #endif /* __SUNPRO_C */
1128               for (i = 0; i <= (wid - 2); i += 2) {
1129                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1130 
1131                 p5 = buff[i + 5]; p6 = buff[i + 6];
1132 
1133                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1134                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1135 
1136                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1137                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1138 
1139                 STORE_RES(dp[0    ], d0);
1140                 STORE_RES(dp[chan1], d1);
1141 
1142                 buffd[i    ] = 0;
1143                 buffd[i + 1] = 0;
1144 
1145                 sp += chan2;
1146                 dp += chan2;
1147               }
1148             }
1149 
1150           } else if (kw == 5) {
1151 
1152             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1153             p5 = buff[3];
1154 
1155             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1156             k4 = pk[4];
1157 
1158             if (l < (n - 1) || off < m) {
1159 #ifdef __SUNPRO_C
1160 #pragma pipeloop(0)
1161 #endif /* __SUNPRO_C */
1162               for (i = 0; i <= (wid - 2); i += 2) {
1163                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1164 
1165                 p4 = buff[i + 4]; p5 = buff[i + 5];
1166 
1167                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1168                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1169               }
1170 
1171             } else {
1172 #ifdef __SUNPRO_C
1173 #pragma pipeloop(0)
1174 #endif /* __SUNPRO_C */
1175               for (i = 0; i <= (wid - 2); i += 2) {
1176                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1177 
1178                 p4 = buff[i + 4]; p5 = buff[i + 5];
1179 
1180                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1181                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1182 
1183                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1184                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1185 
1186                 STORE_RES(dp[0    ], d0);
1187                 STORE_RES(dp[chan1], d1);
1188 
1189                 buffd[i    ] = 0;
1190                 buffd[i + 1] = 0;
1191 
1192                 sp += chan2;
1193                 dp += chan2;
1194               }
1195             }
1196 
1197           } else if (kw == 4) {
1198 
1199             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1200 
1201             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1202 
1203             if (l < (n - 1) || off < m) {
1204 #ifdef __SUNPRO_C
1205 #pragma pipeloop(0)
1206 #endif /* __SUNPRO_C */
1207               for (i = 0; i <= (wid - 2); i += 2) {
1208                 p0 = p2; p1 = p3; p2 = p4;
1209 
1210                 p3 = buff[i + 3]; p4 = buff[i + 4];
1211 
1212                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1213                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1214               }
1215 
1216             } else {
1217 #ifdef __SUNPRO_C
1218 #pragma pipeloop(0)
1219 #endif /* __SUNPRO_C */
1220               for (i = 0; i <= (wid - 2); i += 2) {
1221                 p0 = p2; p1 = p3; p2 = p4;
1222 
1223                 p3 = buff[i + 3]; p4 = buff[i + 4];
1224 
1225                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1226                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1227 
1228                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1229                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1230 
1231                 STORE_RES(dp[0    ], d0);
1232                 STORE_RES(dp[chan1], d1);
1233 
1234                 buffd[i    ] = 0;
1235                 buffd[i + 1] = 0;
1236 
1237                 sp += chan2;
1238                 dp += chan2;
1239               }
1240             }
1241 
1242           } else if (kw == 3) {
1243 
1244             p2 = buff[0]; p3 = buff[1];
1245             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1246 
1247             if (l < (n - 1) || off < m) {
1248 #ifdef __SUNPRO_C
1249 #pragma pipeloop(0)
1250 #endif /* __SUNPRO_C */
1251               for (i = 0; i <= (wid - 2); i += 2) {
1252                 p0 = p2; p1 = p3;
1253 
1254                 p2 = buff[i + 2]; p3 = buff[i + 3];
1255 
1256                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1257                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1258               }
1259 
1260             } else {
1261 #ifdef __SUNPRO_C
1262 #pragma pipeloop(0)
1263 #endif /* __SUNPRO_C */
1264               for (i = 0; i <= (wid - 2); i += 2) {
1265                 p0 = p2; p1 = p3;
1266 
1267                 p2 = buff[i + 2]; p3 = buff[i + 3];
1268 
1269                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1270                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1271 
1272                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1273                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1274 
1275                 STORE_RES(dp[0    ], d0);
1276                 STORE_RES(dp[chan1], d1);
1277 
1278                 buffd[i    ] = 0;
1279                 buffd[i + 1] = 0;
1280 
1281                 sp += chan2;
1282                 dp += chan2;
1283               }
1284             }
1285 
1286           } else if (kw == 2) {
1287 
1288             p2 = buff[0];
1289             k0 = pk[0]; k1 = pk[1];
1290 
1291             if (l < (n - 1) || off < m) {
1292 #ifdef __SUNPRO_C
1293 #pragma pipeloop(0)
1294 #endif /* __SUNPRO_C */
1295               for (i = 0; i <= (wid - 2); i += 2) {
1296                 p0 = p2;
1297 
1298                 p1 = buff[i + 1]; p2 = buff[i + 2];
1299 
1300                 buffd[i    ] += p0*k0 + p1*k1;
1301                 buffd[i + 1] += p1*k0 + p2*k1;
1302               }
1303 
1304             } else {
1305 #ifdef __SUNPRO_C
1306 #pragma pipeloop(0)
1307 #endif /* __SUNPRO_C */
1308               for (i = 0; i <= (wid - 2); i += 2) {
1309                 p0 = p2;
1310 
1311                 p1 = buff[i + 1]; p2 = buff[i + 2];
1312 
1313                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1314                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1315 
1316                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1317                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1318 
1319                 STORE_RES(dp[0    ], d0);
1320                 STORE_RES(dp[chan1], d1);
1321 
1322                 buffd[i    ] = 0;
1323                 buffd[i + 1] = 0;
1324 
1325                 sp += chan2;
1326                 dp += chan2;
1327               }
1328             }
1329 
1330           } else /* kw == 1 */{
1331 
1332             k0 = pk[0];
1333 
1334             if (l < (n - 1) || off < m) {
1335 #ifdef __SUNPRO_C
1336 #pragma pipeloop(0)
1337 #endif /* __SUNPRO_C */
1338               for (i = 0; i <= (wid - 2); i += 2) {
1339                 p0 = buff[i]; p1 = buff[i + 1];
1340 
1341                 buffd[i    ] += p0*k0;
1342                 buffd[i + 1] += p1*k0;
1343               }
1344 
1345             } else {
1346 #ifdef __SUNPRO_C
1347 #pragma pipeloop(0)
1348 #endif /* __SUNPRO_C */
1349               for (i = 0; i <= (wid - 2); i += 2) {
1350                 p0 = buff[i]; p1 = buff[i + 1];
1351 
1352                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1353                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1354 
1355                 d0 = (p0*k0 + buffd[i    ]);
1356                 d1 = (p1*k0 + buffd[i + 1]);
1357 
1358                 STORE_RES(dp[0    ], d0);
1359                 STORE_RES(dp[chan1], d1);
1360 
1361                 buffd[i    ] = 0;
1362                 buffd[i + 1] = 0;
1363 
1364                 sp += chan2;
1365                 dp += chan2;
1366               }
1367             }
1368           }
1369 
1370           pk += kw;
1371         }
1372       }
1373 
1374       /* last pixels */
1375       for (; i < wid; i++) {
1376         mlib_s32 *pk = k, x, s = 0;
1377 
1378         for (l = 0; l < n; l++) {
1379           mlib_s32 *buff = buffc[l] + i;
1380 
1381           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1382         }
1383 
1384         STORE_RES(dp[0], s);
1385 
1386         buffn[i + dx_l] = (mlib_s32)sp[0];
1387 
1388         sp += chan1;
1389         dp += chan1;
1390       }
1391 
1392       for (; i < swid; i++) {
1393         buffn[i + dx_l] = (mlib_s32)sp[0];
1394         sp += chan1;
1395       }
1396 
1397       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1398       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1399 
1400       /* next line */
1401 
1402       if (j < hgt - dy_b - 2) sl += sll;
1403       dl += dll;
1404 
1405       buff_ind++;
1406 
1407       if (buff_ind >= n + 1) buff_ind = 0;
1408     }
1409   }
1410 
1411   if (pbuff != buff) mlib_free(pbuff);
1412   if (k != k_locl) mlib_free(k);
1413 
1414   return MLIB_SUCCESS;
1415 }
1416 
1417 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1418 
1419 /***************************************************************/
1420