1 /*
2  * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 /*
28  * FUNCTION
29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 types and
30  *   MLIB_EDGE_DST_NO_WRITE mask
31  */
32 
33 #include "mlib_image.h"
34 #include "mlib_c_ImageConv.h"
35 
36 /*
37   This define switches between functions of different data types
38 */
39 #define IMG_TYPE 2
40 
41 /***************************************************************/
/*
 * Per-type settings selected by IMG_TYPE
 * (1: mlib_u8, 2: mlib_s16, 3: mlib_u16):
 *
 *   DTYPE             pixel type of the source and destination images
 *   CONV_FUNC(KERN)   name of the floating-point routine for kernel KERN
 *   CONV_FUNC_I(KERN) name of the integer routine for kernel KERN
 *   DSCALE            starting value of the kernel scale factor
 *   FROM_S32(x)       turns a 32-bit result into a pixel value: a right
 *                     shift (by 24 or 16), XOR-ed with 128 / 0x8000 for
 *                     the two unsigned types
 *   S64TOS32(x)       wrapper LOAD_BUFF applies to the operand that fills
 *                     the low half of a packed 64-bit value; masks to 32
 *                     bits for mlib_s16, identity otherwise
 *   SAT_OFF           text pasted after the operand of D2I: empty for
 *                     mlib_s16, "-(1u << 31)" for the unsigned types
 */
42 #if IMG_TYPE == 1
43 
44 #define DTYPE             mlib_u8
45 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
46 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
47 #define DSCALE            (1 << 24)
48 #define FROM_S32(x)       (((x) >> 24) ^ 128)
49 #define S64TOS32(x)       (x)
50 #define SAT_OFF           -(1u << 31)
51 
52 #elif IMG_TYPE == 2
53 
54 #define DTYPE             mlib_s16
55 #define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
56 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
57 #define DSCALE            65536.0
58 #define FROM_S32(x)       ((x) >> 16)
59 #define S64TOS32(x)       ((x) & 0xffffffff)
60 #define SAT_OFF
61 
62 #elif IMG_TYPE == 3
63 
64 #define DTYPE             mlib_u16
65 #define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
66 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
67 #define DSCALE            65536.0
68 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
69 #define S64TOS32(x)       (x)
70 #define SAT_OFF           -(1u << 31)
71 
72 #endif /* IMG_TYPE == 1 */
73 
74 /***************************************************************/
/*
 * Size constants. Neither is referenced by the 3x3/4x4/5x5 routines below;
 * presumably they are used by the general MxN code later in the file --
 * TODO confirm.
 */
75 #define BUFF_SIZE   1600
76 
77 #define CACHE_SIZE  (64*1024)
78 
79 /***************************************************************/
/* Floating-point type used for the kernel, row buffers and accumulators. */
80 #define FTYPE mlib_d64
81 
/*
 * CLAMP_S32(x) converts x to mlib_s32.
 *
 * Unless MLIB_USE_FTOI_CLAMPING is defined, a value at or below
 * MLIB_S32_MIN or at or above MLIB_S32_MAX is replaced by that limit
 * before the cast; x is evaluated up to three times. With
 * MLIB_USE_FTOI_CLAMPING defined a plain cast is used (presumably the
 * platform's float-to-int conversion already saturates -- TODO confirm).
 */
82 #ifndef MLIB_USE_FTOI_CLAMPING
83 
84 #define CLAMP_S32(x)                                            \
85   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
86 
87 #else
88 
89 #define CLAMP_S32(x) ((mlib_s32)(x))
90 
91 #endif /* MLIB_USE_FTOI_CLAMPING */
92 
93 /***************************************************************/
/*
 * D2I(x) clamps a floating-point sum to mlib_s32. SAT_OFF is pasted
 * textually after the operand: it is empty for mlib_s16 and subtracts
 * 2^31 for the unsigned pixel types.
 */
94 #define D2I(x) CLAMP_S32((x) SAT_OFF)
95 
96 /***************************************************************/
/*
 * STORE2(res0, res1) writes two results to the output pixels dp[0] and
 * dp[chan1]; `dp` and `chan1` must be in scope at the point of use.
 * When VM_LITTLE_ENDIAN is defined the two operands are swapped.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe under an unbraced if/else); use it with a trailing ';'.
 */
#ifdef VM_LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  do {                                                          \
    dp[0    ] = (res1);                                         \
    dp[chan1] = (res0);                                         \
  } while (0)

#else

#define STORE2(res0, res1)                                      \
  do {                                                          \
    dp[0    ] = (res0);                                         \
    dp[chan1] = (res1);                                         \
  } while (0)

#endif /* VM_LITTLE_ENDIAN */
110 
111 /***************************************************************/
/*
 * LOAD_BUFF(buff) copies the two source pixels sp[0] and sp[chan1] into
 * the 32-bit slots buff[i] and buff[i + 1]; `sp`, `chan1` and `i` must be
 * in scope at the point of use. Use it as a statement with a trailing ';'.
 *
 * With _NO_LONGLONG defined the pixels are stored one at a time. Otherwise
 * both are packed into a single mlib_s64 and written with one 64-bit store
 * (so buff + i must be suitably aligned for mlib_s64); the operand order
 * depends on VM_LITTLE_ENDIAN so that buff[i] always receives sp[0].
 *
 * The upper half is positioned with a multiplication by 2^32 instead of
 * "<< 32": pixels can be negative (mlib_s16) and left-shifting a negative
 * value is undefined behaviour in C, while the product is always
 * representable in mlib_s64 and yields the same bit pattern.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  do {                                                          \
    (buff)[i    ] = sp[0];                                      \
    (buff)[i + 1] = sp[chan1];                                  \
  } while (0)

#else /* _NO_LONGLONG */

#ifdef VM_LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)((buff) + i) =                                    \
    (((mlib_s64)sp[chan1]) * ((mlib_s64)1 << 32)) | S64TOS32((mlib_s64)sp[0])

#else /* VM_LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)((buff) + i) =                                    \
    (((mlib_s64)sp[0]) * ((mlib_s64)1 << 32)) | S64TOS32((mlib_s64)sp[chan1])

#endif /* VM_LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
132 
133 /***************************************************************/
134 typedef union {
135   mlib_d64 d64;
136   struct {
137     mlib_s32 i0;
138     mlib_s32 i1;
139   } i32s;
140   struct {
141     mlib_s32 f0;
142     mlib_s32 f1;
143   } f32s;
144 } d64_2x32;
145 
146 /***************************************************************/
/*
 * Image width (in pixels) up to which the routines below use their
 * on-stack scratch array; wider images get scratch space from mlib_malloc.
 */
147 #define BUFF_LINE 256
148 
149 /***************************************************************/
/*
 * DEF_VARS(type) declares the locals shared by the convolution routines:
 *   adr_src, sl, sp   source base / current row / current pixel pointers
 *   adr_dst, dl, dp   the same for the destination
 *   pbuff             scratch pointer, initialised to the array `buff`,
 *                     which must therefore be declared before DEF_VARS
 *   wid, hgt, sll, dll, nchannel, chan1   image geometry
 *   i, j, c           loop counters
 * It expands to declarations only, so it belongs in the declaration part
 * of a block.
 */
150 #define DEF_VARS(type)                                          \
151   type     *adr_src, *sl, *sp = NULL;                           \
152   type     *adr_dst, *dl, *dp = NULL;                           \
153   FTYPE    *pbuff = buff;                                       \
154   mlib_s32 wid, hgt, sll, dll;                                  \
155   mlib_s32 nchannel, chan1;                                     \
156   mlib_s32 i, j, c
157 
158 /***************************************************************/
/*
 * LOAD_KERNEL3() declares scalef, the nine coefficients k0..k8 and the
 * pixel temporaries p00..p23, then fills k0..k8 from kern[0..8].
 *
 * scalef starts at DSCALE and is divided by 2^scalef_expon; the division
 * is done in steps of 2^30 so that no single shift count exceeds 30.
 * Each coefficient is kern[n] * scalef. scalef_expon is modified.
 *
 * The expansion is declarations followed by statements, so it must be the
 * last item of a declaration list. Requires `kern` and `scalef_expon`.
 * NOTE(review): a negative scalef_expon would give a negative shift count
 * in the final division -- presumably callers never pass one; confirm.
 */
159 #define LOAD_KERNEL3()                                                   \
160   FTYPE    scalef = DSCALE;                                              \
161   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
162   FTYPE    p00, p01, p02, p03,                                           \
163            p10, p11, p12, p13,                                           \
164            p20, p21, p22, p23;                                           \
165                                                                          \
166   while (scalef_expon > 30) {                                            \
167     scalef /= (1 << 30);                                                 \
168     scalef_expon -= 30;                                                  \
169   }                                                                      \
170                                                                          \
171   scalef /= (1 << scalef_expon);                                         \
172                                                                          \
173   /* keep kernel in regs */                                              \
174   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
175   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
176   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]
177 
178 /***************************************************************/
/*
 * LOAD_KERNEL(SIZE) declares scalef and fills k[0..SIZE-1] with
 * kern[j] * scalef, where scalef = DSCALE / 2^scalef_expon (computed in
 * steps of 2^30, as in LOAD_KERNEL3). scalef_expon is modified.
 *
 * Requires `k`, `kern`, `scalef_expon` and the counter `j` to be in scope.
 * The expansion is one declaration followed by statements, so it must be
 * the last item of a declaration list.
 */
179 #define LOAD_KERNEL(SIZE)                                       \
180   FTYPE    scalef = DSCALE;                                     \
181                                                                 \
182   while (scalef_expon > 30) {                                   \
183     scalef /= (1 << 30);                                        \
184     scalef_expon -= 30;                                         \
185   }                                                             \
186                                                                 \
187   scalef /= (1 << scalef_expon);                                \
188                                                                 \
189   for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
190 
191 /***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type) fills the locals declared by DEF_VARS from
 * the images `src` and `dst`:
 *   hgt, wid, nchannel   taken from src only
 *   sll, dll             row strides of src / dst divided by sizeof(type),
 *                        i.e. expressed in elements of `type`
 *   adr_src, adr_dst     data pointers cast to `type *`
 */
192 #define GET_SRC_DST_PARAMETERS(type)                            \
193   hgt = mlib_ImageGetHeight(src);                               \
194   wid = mlib_ImageGetWidth(src);                                \
195   nchannel = mlib_ImageGetChannels(src);                        \
196   sll = mlib_ImageGetStride(src) / sizeof(type);                \
197   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
198   adr_src = (type *)mlib_ImageGetData(src);                     \
199   adr_dst = (type *)mlib_ImageGetData(dst)
200 
201 /***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val) stores the integer `val` into the pixel lvalue
 * `dst`, saturated to the range of the pixel type selected by IMG_TYPE.
 *
 * Each variant is wrapped in do { } while (0) so that it is a single
 * statement regardless of context (e.g. under an unbraced if/else), and
 * the arguments are parenthesized. Use it with a trailing ';'.
 * `val` is evaluated more than once, so it must have no side effects.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) & 0xffffff00) {                                   \
      if ((val) < MLIB_U8_MIN)                                  \
        (dst) = MLIB_U8_MIN;                                    \
      else                                                      \
        (dst) = MLIB_U8_MAX;                                    \
    } else {                                                    \
      (dst) = (mlib_u8)(val);                                   \
    }                                                           \
  } while (0)

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  } while (0)

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  } while (0)

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
247 
248 /***************************************************************/
249 #define KSIZE  3
250 
251 mlib_status CONV_FUNC(3x3)(mlib_image       *dst,
252                            const mlib_image *src,
253                            const mlib_s32   *kern,
254                            mlib_s32         scalef_expon,
255                            mlib_s32         cmask)
256 {
257   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
258   DEF_VARS(DTYPE);
259   DTYPE *sl1;
260   mlib_s32 chan2;
261   mlib_s32 *buffo, *buffi;
262   DTYPE *sl2;
263 #ifndef __sparc
264   mlib_s32 d0, d1;
265 #endif /* __sparc */
266   LOAD_KERNEL3();
267   GET_SRC_DST_PARAMETERS(DTYPE);
268 
269   if (wid > BUFF_LINE) {
270     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
271 
272     if (pbuff == NULL) return MLIB_FAILURE;
273   }
274 
275   buff0 = pbuff;
276   buff1 = buff0 + wid;
277   buff2 = buff1 + wid;
278   buff3 = buff2 + wid;
279   buffo = (mlib_s32*)(buff3 + wid);
280   buffi = buffo + (wid &~ 1);
281 
282   chan1 = nchannel;
283   chan2 = chan1 + chan1;
284 
285   wid -= (KSIZE - 1);
286   hgt -= (KSIZE - 1);
287 
288   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
289 
290   for (c = 0; c < nchannel; c++) {
291     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
292 
293     sl = adr_src + c;
294     dl = adr_dst + c;
295 
296     sl1 = sl  + sll;
297     sl2 = sl1 + sll;
298 #ifdef __SUNPRO_C
299 #pragma pipeloop(0)
300 #endif /* __SUNPRO_C */
301     for (i = 0; i < wid + (KSIZE - 1); i++) {
302       buff0[i] = (FTYPE)sl[i*chan1];
303       buff1[i] = (FTYPE)sl1[i*chan1];
304       buff2[i] = (FTYPE)sl2[i*chan1];
305     }
306 
307     sl += KSIZE*sll;
308 
309     for (j = 0; j < hgt; j++) {
310       FTYPE    s0, s1;
311 
312       p02 = buff0[0];
313       p12 = buff1[0];
314       p22 = buff2[0];
315 
316       p03 = buff0[1];
317       p13 = buff1[1];
318       p23 = buff2[1];
319 
320       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
321       s1 = p03 * k0 + p13 * k3 + p23 * k6;
322 
323       sp = sl;
324       dp = dl;
325 
326 #ifdef __SUNPRO_C
327 #pragma pipeloop(0)
328 #endif /* __SUNPRO_C */
329       for (i = 0; i <= (wid - 2); i += 2) {
330 #ifdef __sparc
331 #ifdef _NO_LONGLONG
332         mlib_s32 o64_1, o64_2;
333 #else /* _NO_LONGLONG */
334         mlib_s64 o64;
335 #endif /* _NO_LONGLONG */
336 #endif /* __sparc */
337         d64_2x32 dd;
338 
339         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
340         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
341 
342         LOAD_BUFF(buffi);
343 
344         dd.d64 = *(FTYPE   *)(buffi + i);
345         buff3[i    ] = (FTYPE)dd.i32s.i0;
346         buff3[i + 1] = (FTYPE)dd.i32s.i1;
347 
348 #ifndef __sparc
349         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
350         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
351 
352         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
353         s1 = p03 * k0 + p13 * k3 + p23 * k6;
354 
355         dp[0    ] = FROM_S32(d0);
356         dp[chan1] = FROM_S32(d1);
357 
358 #else /* __sparc */
359 
360         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
361         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
362         *(FTYPE   *)(buffo + i) = dd.d64;
363 
364         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
365         s1 = p03 * k0 + p13 * k3 + p23 * k6;
366 
367 #ifdef _NO_LONGLONG
368 
369         o64_1 = buffo[i];
370         o64_2 = buffo[i+1];
371 #if IMG_TYPE != 1
372         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
373 #else
374         STORE2(o64_1 >> 24, o64_2 >> 24);
375 #endif /* IMG_TYPE != 1 */
376 
377 #else /* _NO_LONGLONG */
378 
379         o64 = *(mlib_s64*)(buffo + i);
380 #if IMG_TYPE != 1
381         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
382 #else
383         STORE2(o64 >> 56, o64 >> 24);
384 #endif /* IMG_TYPE != 1 */
385 #endif /* _NO_LONGLONG */
386 #endif /* __sparc */
387 
388         sp += chan2;
389         dp += chan2;
390       }
391 
392       for (; i < wid; i++) {
393         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
394         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
395         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
396 
397         buffi[i] = (mlib_s32)sp[0];
398         buff3[i] = (FTYPE)buffi[i];
399 
400 #ifndef __sparc
401 
402         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
403                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
404 
405         dp[0] = FROM_S32(d0);
406 
407 #else  /* __sparc */
408 
409         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
410                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
411 #if IMG_TYPE != 1
412         dp[0] = FROM_S32(buffo[i]);
413 #else
414         dp[0] = buffo[i] >> 24;
415 #endif /* IMG_TYPE != 1 */
416 #endif /* __sparc */
417 
418         sp += chan1;
419         dp += chan1;
420       }
421 
422       buffi[wid] = (mlib_s32)sp[0];
423       buff3[wid] = (FTYPE)buffi[wid];
424       buffi[wid + 1] = (mlib_s32)sp[chan1];
425       buff3[wid + 1] = (FTYPE)buffi[wid + 1];
426 
427       sl += sll;
428       dl += dll;
429 
430       buffT = buff0;
431       buff0 = buff1;
432       buff1 = buff2;
433       buff2 = buff3;
434       buff3 = buffT;
435     }
436   }
437 
438 #ifdef __sparc
439 #if IMG_TYPE == 1
440   {
441     mlib_s32 amask = (1 << nchannel) - 1;
442 
443     if ((cmask & amask) != amask) {
444       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
445     } else {
446       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
447     }
448   }
449 
450 #endif /* IMG_TYPE == 1 */
451 #endif /* __sparc */
452 
453   if (pbuff != buff) mlib_free(pbuff);
454 
455   return MLIB_SUCCESS;
456 }
457 
458 /***************************************************************/
459 #ifndef __sparc /* for x86, using integer multiplies is faster */
460 
461 mlib_status CONV_FUNC_I(3x3)(mlib_image       *dst,
462                              const mlib_image *src,
463                              const mlib_s32   *kern,
464                              mlib_s32         scalef_expon,
465                              mlib_s32         cmask)
466 {
467   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2;
468   DTYPE    *adr_dst, *dl, *dp;
469   mlib_s32 wid, hgt, sll, dll;
470   mlib_s32 nchannel, chan1, chan2;
471   mlib_s32 i, j, c;
472   mlib_s32 shift1, shift2;
473   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
474   mlib_s32 p02, p03,
475            p12, p13,
476            p22, p23;
477 
478 #if IMG_TYPE != 1
479   shift1 = 16;
480 #else
481   shift1 = 8;
482 #endif /* IMG_TYPE != 1 */
483 
484   shift2 = scalef_expon - shift1;
485 
486   /* keep kernel in regs */
487   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
488   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
489   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
490 
491   GET_SRC_DST_PARAMETERS(DTYPE);
492 
493   chan1 = nchannel;
494   chan2 = chan1 + chan1;
495 
496   wid -= (KSIZE - 1);
497   hgt -= (KSIZE - 1);
498 
499   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
500 
501   for (c = 0; c < chan1; c++) {
502     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
503 
504     sl = adr_src + c;
505     dl = adr_dst + c;
506 
507     for (j = 0; j < hgt; j++) {
508       mlib_s32 s0, s1;
509       mlib_s32 pix0, pix1;
510 
511       dp  = dl;
512       sp0 = sl;
513       sp1 = sp0 + sll;
514       sp2 = sp1 + sll;
515 
516       p02 = sp0[0];
517       p12 = sp1[0];
518       p22 = sp2[0];
519 
520       p03 = sp0[chan1];
521       p13 = sp1[chan1];
522       p23 = sp2[chan1];
523 
524       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
525       s1 = p03 * k0 + p13 * k3 + p23 * k6;
526 
527       sp0 += chan2;
528       sp1 += chan2;
529       sp2 += chan2;
530 
531 #ifdef __SUNPRO_C
532 #pragma pipeloop(0)
533 #endif /* __SUNPRO_C */
534       for (i = 0; i <= (wid - 2); i += 2) {
535         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
536         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
537 
538         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
539         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
540                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
541 
542         CLAMP_STORE(dp[0],     pix0);
543         CLAMP_STORE(dp[chan1], pix1);
544 
545         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
546         s1 = p03 * k0 + p13 * k3 + p23 * k6;
547 
548         sp0 += chan2;
549         sp1 += chan2;
550         sp2 += chan2;
551         dp += chan2;
552       }
553 
554       if (wid & 1) {
555         p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
556         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
557         CLAMP_STORE(dp[0], pix0);
558       }
559 
560       sl += sll;
561       dl += dll;
562     }
563   }
564 
565   return MLIB_SUCCESS;
566 }
567 
568 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
569 
570 /***************************************************************/
571 #undef  KSIZE
572 #define KSIZE 4
573 
574 mlib_status CONV_FUNC(4x4)(mlib_image       *dst,
575                            const mlib_image *src,
576                            const mlib_s32   *kern,
577                            mlib_s32         scalef_expon,
578                            mlib_s32         cmask)
579 {
580   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
581   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
582   FTYPE    k[KSIZE*KSIZE];
583   mlib_s32 d0, d1;
584   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
585   FTYPE    p00, p01, p02, p03, p04,
586            p10, p11, p12, p13, p14,
587            p20, p21, p22, p23,
588            p30, p31, p32, p33;
589   DEF_VARS(DTYPE);
590   DTYPE *sl1;
591   mlib_s32 chan2;
592   mlib_s32 *buffo, *buffi;
593   DTYPE *sl2, *sl3;
594   LOAD_KERNEL(KSIZE*KSIZE);
595   GET_SRC_DST_PARAMETERS(DTYPE);
596 
597   if (wid > BUFF_LINE) {
598     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
599 
600     if (pbuff == NULL) return MLIB_FAILURE;
601   }
602 
603   buff0 = pbuff;
604   buff1 = buff0 + wid;
605   buff2 = buff1 + wid;
606   buff3 = buff2 + wid;
607   buff4 = buff3 + wid;
608   buffd = buff4 + wid;
609   buffo = (mlib_s32*)(buffd + wid);
610   buffi = buffo + (wid &~ 1);
611 
612   chan1 = nchannel;
613   chan2 = chan1 + chan1;
614 
615   wid -= (KSIZE - 1);
616   hgt -= (KSIZE - 1);
617 
618   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
619 
620   for (c = 0; c < nchannel; c++) {
621     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
622 
623     sl = adr_src + c;
624     dl = adr_dst + c;
625 
626     sl1 = sl  + sll;
627     sl2 = sl1 + sll;
628     sl3 = sl2 + sll;
629 #ifdef __SUNPRO_C
630 #pragma pipeloop(0)
631 #endif /* __SUNPRO_C */
632     for (i = 0; i < wid + (KSIZE - 1); i++) {
633       buff0[i] = (FTYPE)sl[i*chan1];
634       buff1[i] = (FTYPE)sl1[i*chan1];
635       buff2[i] = (FTYPE)sl2[i*chan1];
636       buff3[i] = (FTYPE)sl3[i*chan1];
637     }
638 
639     sl += KSIZE*sll;
640 
641     for (j = 0; j < hgt; j++) {
642       d64_2x32 dd;
643 
644       /*
645        *  First loop on two first lines of kernel
646        */
647       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
648       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
649 
650       sp = sl;
651       dp = dl;
652 
653       p02 = buff0[0];
654       p12 = buff1[0];
655       p03 = buff0[1];
656       p13 = buff1[1];
657       p04 = buff0[2];
658 
659 #ifdef __SUNPRO_C
660 #pragma pipeloop(0)
661 #endif /* __SUNPRO_C */
662       for (i = 0; i <= (wid - 2); i += 2) {
663         p00 = p02; p10 = p12;
664         p01 = p03; p11 = p13;
665         p02 = p04; p12 = buff1[i + 2];
666         p03 = buff0[i + 3]; p13 = buff1[i + 3];
667         p04 = buff0[i + 4]; p14 = buff1[i + 4];
668 
669         LOAD_BUFF(buffi);
670 
671         dd.d64 = *(FTYPE   *)(buffi + i);
672         buff4[i    ] = (FTYPE)dd.i32s.i0;
673         buff4[i + 1] = (FTYPE)dd.i32s.i1;
674 
675         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
676                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
677         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
678                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
679 
680         sp += chan2;
681         dp += chan2;
682       }
683 
684       /*
685        *  Second loop on two last lines of kernel
686        */
687       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
688       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
689 
690       sp = sl;
691       dp = dl;
692 
693       p02 = buff2[0];
694       p12 = buff3[0];
695       p03 = buff2[1];
696       p13 = buff3[1];
697       p04 = buff2[2];
698 
699 #ifdef __SUNPRO_C
700 #pragma pipeloop(0)
701 #endif /* __SUNPRO_C */
702       for (i = 0; i <= (wid - 2); i += 2) {
703         p00 = p02; p10 = p12;
704         p01 = p03; p11 = p13;
705         p02 = p04; p12 = buff3[i + 2];
706         p03 = buff2[i + 3]; p13 = buff3[i + 3];
707         p04 = buff2[i + 4]; p14 = buff3[i + 4];
708 
709         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
710                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
711         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
712                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
713 
714         dp[0    ] = FROM_S32(d0);
715         dp[chan1] = FROM_S32(d1);
716 
717         sp += chan2;
718         dp += chan2;
719       }
720 
721       /* last pixels */
722       for (; i < wid; i++) {
723         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
724         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
725         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
726         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
727 
728         buff4[i] = (FTYPE)sp[0];
729 
730         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
731                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
732                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
733                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
734 
735         dp[0] = FROM_S32(buffo[i]);
736 
737         sp += chan1;
738         dp += chan1;
739       }
740 
741       buff4[wid    ] = (FTYPE)sp[0];
742       buff4[wid + 1] = (FTYPE)sp[chan1];
743       buff4[wid + 2] = (FTYPE)sp[chan2];
744 
745       /* next line */
746       sl += sll;
747       dl += dll;
748 
749       buffT = buff0;
750       buff0 = buff1;
751       buff1 = buff2;
752       buff2 = buff3;
753       buff3 = buff4;
754       buff4 = buffT;
755     }
756   }
757 
758   if (pbuff != buff) mlib_free(pbuff);
759 
760   return MLIB_SUCCESS;
761 }
762 
763 /***************************************************************/
764 #undef  KSIZE
765 #define KSIZE 5
766 
767 mlib_status CONV_FUNC(5x5)(mlib_image       *dst,
768                            const mlib_image *src,
769                            const mlib_s32   *kern,
770                            mlib_s32         scalef_expon,
771                            mlib_s32         cmask)
772 {
773   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
774   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
775   FTYPE    k[KSIZE*KSIZE];
776   mlib_s32 d0, d1;
777   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
778   FTYPE    p00, p01, p02, p03, p04, p05,
779            p10, p11, p12, p13, p14, p15,
780            p20, p21, p22, p23, p24,
781            p30, p31, p32, p33, p34,
782            p40, p41, p42, p43, p44;
783   DEF_VARS(DTYPE);
784   DTYPE *sl1;
785   mlib_s32 chan2;
786   mlib_s32 *buffo, *buffi;
787   DTYPE *sl2, *sl3, *sl4;
788   LOAD_KERNEL(KSIZE*KSIZE);
789   GET_SRC_DST_PARAMETERS(DTYPE);
790 
791   if (wid > BUFF_LINE) {
792     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
793 
794     if (pbuff == NULL) return MLIB_FAILURE;
795   }
796 
797   buff0 = pbuff;
798   buff1 = buff0 + wid;
799   buff2 = buff1 + wid;
800   buff3 = buff2 + wid;
801   buff4 = buff3 + wid;
802   buff5 = buff4 + wid;
803   buffd = buff5 + wid;
804   buffo = (mlib_s32*)(buffd + wid);
805   buffi = buffo + (wid &~ 1);
806 
807   chan1 = nchannel;
808   chan2 = chan1 + chan1;
809 
810   wid -= (KSIZE - 1);
811   hgt -= (KSIZE - 1);
812 
813   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
814 
815   for (c = 0; c < nchannel; c++) {
816     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
817 
818     sl = adr_src + c;
819     dl = adr_dst + c;
820 
821     sl1 = sl  + sll;
822     sl2 = sl1 + sll;
823     sl3 = sl2 + sll;
824     sl4 = sl3 + sll;
825 #ifdef __SUNPRO_C
826 #pragma pipeloop(0)
827 #endif /* __SUNPRO_C */
828     for (i = 0; i < wid + (KSIZE - 1); i++) {
829       buff0[i] = (FTYPE)sl[i*chan1];
830       buff1[i] = (FTYPE)sl1[i*chan1];
831       buff2[i] = (FTYPE)sl2[i*chan1];
832       buff3[i] = (FTYPE)sl3[i*chan1];
833       buff4[i] = (FTYPE)sl4[i*chan1];
834     }
835 
836     sl += KSIZE*sll;
837 
838     for (j = 0; j < hgt; j++) {
839       d64_2x32 dd;
840 
841       /*
842        *  First loop
843        */
844       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
845       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
846 
847       sp = sl;
848       dp = dl;
849 
850       p02 = buff0[0];
851       p12 = buff1[0];
852       p03 = buff0[1];
853       p13 = buff1[1];
854       p04 = buff0[2];
855       p14 = buff1[2];
856 
857 #ifdef __SUNPRO_C
858 #pragma pipeloop(0)
859 #endif /* __SUNPRO_C */
860       for (i = 0; i <= (wid - 2); i += 2) {
861         p00 = p02; p10 = p12;
862         p01 = p03; p11 = p13;
863         p02 = p04; p12 = p14;
864 
865         LOAD_BUFF(buffi);
866 
867         p03 = buff0[i + 3]; p13 = buff1[i + 3];
868         p04 = buff0[i + 4]; p14 = buff1[i + 4];
869         p05 = buff0[i + 5]; p15 = buff1[i + 5];
870 
871         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
872                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
873         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
874                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
875 
876         sp += chan2;
877         dp += chan2;
878       }
879 
880       /*
881        *  Second loop
882        */
883       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
884       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
885 
886       sp = sl;
887       dp = dl;
888 
889       p02 = buff2[0];
890       p12 = buff3[0];
891       p03 = buff2[1];
892       p13 = buff3[1];
893       p04 = buff2[2];
894       p14 = buff3[2];
895 
896 #ifdef __SUNPRO_C
897 #pragma pipeloop(0)
898 #endif /* __SUNPRO_C */
899       for (i = 0; i <= (wid - 2); i += 2) {
900         p00 = p02; p10 = p12;
901         p01 = p03; p11 = p13;
902 
903         p02 = buff2[i + 2]; p12 = buff3[i + 2];
904         p03 = buff2[i + 3]; p13 = buff3[i + 3];
905         p04 = buff2[i + 4]; p14 = buff3[i + 4];
906         p05 = buff2[i + 5]; p15 = buff3[i + 5];
907 
908         dd.d64 = *(FTYPE   *)(buffi + i);
909         buff5[i    ] = (FTYPE)dd.i32s.i0;
910         buff5[i + 1] = (FTYPE)dd.i32s.i1;
911 
912         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
913                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
914         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
915                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
916 
917         sp += chan2;
918         dp += chan2;
919       }
920 
921       /*
922        *  3 loop
923        */
924       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
925 
926       sp = sl;
927       dp = dl;
928 
929       p02 = buff4[0];
930       p03 = buff4[1];
931       p04 = buff4[2];
932       p05 = buff4[3];
933 
934 #ifdef __SUNPRO_C
935 #pragma pipeloop(0)
936 #endif /* __SUNPRO_C */
937       for (i = 0; i <= (wid - 2); i += 2) {
938         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
939 
940         p04 = buff4[i + 4]; p05 = buff4[i + 5];
941 
942         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
943         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
944 
945         dp[0    ] = FROM_S32(d0);
946         dp[chan1] = FROM_S32(d1);
947 
948         sp += chan2;
949         dp += chan2;
950       }
951 
952       /* last pixels */
953       for (; i < wid; i++) {
954         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
955         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
956         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
957         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
958         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
959 
960         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
961         p43 = buff4[i + 3]; p44 = buff4[i + 4];
962 
963         buff5[i] = (FTYPE)sp[0];
964 
965         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
966                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
967                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
968                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
969                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
970 
971         dp[0] = FROM_S32(buffo[i]);
972 
973         sp += chan1;
974         dp += chan1;
975       }
976 
977       buff5[wid    ] = (FTYPE)sp[0];
978       buff5[wid + 1] = (FTYPE)sp[chan1];
979       buff5[wid + 2] = (FTYPE)sp[chan2];
980       buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
981 
982       /* next line */
983       sl += sll;
984       dl += dll;
985 
986       buffT = buff0;
987       buff0 = buff1;
988       buff1 = buff2;
989       buff2 = buff3;
990       buff3 = buff4;
991       buff4 = buff5;
992       buff5 = buffT;
993     }
994   }
995 
996   if (pbuff != buff) mlib_free(pbuff);
997 
998   return MLIB_SUCCESS;
999 }
1000 
1001 /***************************************************************/
1002 #ifndef __sparc /* for x86, using integer multiplies is faster */
1003 
/*
 * CONV_FUNC_I(5x5): 5x5 convolution computed entirely in integer arithmetic
 * (this variant is compiled only when __sparc is not defined).
 *
 * Each output row is produced in three passes over the same span of pixels:
 *   pass 1 - kernel rows 0 and 1, result written to buffd[]
 *   pass 2 - kernel rows 2 and 3, result added to buffd[]
 *   pass 3 - kernel row 4, added to buffd[], shifted and stored to dst
 * Every pass handles two adjacent output pixels per iteration and carries
 * the overlapping source samples from one iteration to the next in
 * p00..p05 / p10..p15.
 *
 * Parameters:
 *   dst, src     - destination and source images (geometry is read by
 *                  GET_SRC_DST_PARAMETERS)
 *   kern         - KSIZE*KSIZE coefficients in row-major order
 *   scalef_expon - total right shift applied to a sum of products: shift1
 *                  bits are removed from each coefficient up front, the
 *                  remaining shift2 bits from the accumulated sum
 *   cmask        - channel mask; channel c is processed only when bit
 *                  (nchannel - 1 - c) is set
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the line buffer cannot be
 * allocated.
 *
 * NOTE(review): the hard-coded indices k[0]..k[24] assume KSIZE is 5 at this
 * point; KSIZE is defined outside this function - confirm.
 */
mlib_status CONV_FUNC_I(5x5)(mlib_image       *dst,
                             const mlib_image *src,
                             const mlib_s32   *kern,
                             mlib_s32         scalef_expon,
                             mlib_s32         cmask)
{
  mlib_s32 buff[BUFF_LINE];
  mlib_s32 *buffd;
  mlib_s32 k[KSIZE*KSIZE];
  mlib_s32 shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
  mlib_s32 p00, p01, p02, p03, p04, p05,
           p10, p11, p12, p13, p14, p15;
  DTYPE    *adr_src, *sl, *sp0, *sp1;
  DTYPE    *adr_dst, *dl, *dp;
  mlib_s32 *pbuff = buff;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1, chan2, chan3, chan4;
  mlib_s32 i, j, c;

  /* bits dropped from every coefficient before multiplying */
#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */

  /* bits still to be dropped from each accumulated sum */
  shift2 = scalef_expon - shift1;

  for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* one mlib_s32 accumulator per pixel of a line; heap only for wide images */
  if (wid > BUFF_LINE) {
    pbuff = mlib_malloc(sizeof(mlib_s32)*wid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  buffd = pbuff;

  /* element offsets of 1..4 pixels within an interleaved line */
  chan1 = nchannel;
  chan2 = chan1 + chan1;
  chan3 = chan2 + chan1;
  chan4 = chan3 + chan1;

  /* from here on wid/hgt describe the written (interior) region */
  wid -= (KSIZE - 1);
  hgt -= (KSIZE - 1);

  /* first written pixel sits (KSIZE - 1)/2 rows and pixels inside dst */
  adr_dst += ((KSIZE - 1)/2)*(dll + chan1);

  for (c = 0; c < chan1; c++) {
    /* skip channels that are not selected in cmask */
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    for (j = 0; j < hgt; j++) {
      mlib_s32 pix0, pix1;
      /*
       *  First loop: kernel rows 0 and 1 applied to source rows j and j + 1
       */
      sp0 = sl;
      sp1 = sp0 + sll;
      dp = dl;

      k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
      k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];

      /* prime the sliding window with the first four pixels of both rows */
      p02 = sp0[0];     p12 = sp1[0];
      p03 = sp0[chan1]; p13 = sp1[chan1];
      p04 = sp0[chan2]; p14 = sp1[chan2];
      p05 = sp0[chan3]; p15 = sp1[chan3];

      sp0 += chan4;
      sp1 += chan4;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        /* slide the window two pixels to the right */
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];
        p05 = sp0[chan1]; p15 = sp1[chan1];

        /* this pass initialises buffd (plain assignment, not +=) */
        buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                        p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                        p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);

        sp0 += chan2;
        sp1 += chan2;
        /* dp is not dereferenced in this pass; it is reset before the next one */
        dp += chan2;
      }

      /* odd width: one remaining pixel */
      if (wid & 1) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];

        buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                    p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
      }

      /*
       *  Second loop: kernel rows 2 and 3 applied to source rows j + 2 and
       *  j + 3, accumulated onto buffd
       */
      sp0 = sl + 2*sll;
      sp1 = sp0 + sll;
      dp = dl;

      k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
      k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];

      p02 = sp0[0];     p12 = sp1[0];
      p03 = sp0[chan1]; p13 = sp1[chan1];
      p04 = sp0[chan2]; p14 = sp1[chan2];
      p05 = sp0[chan3]; p15 = sp1[chan3];

      sp0 += chan4;
      sp1 += chan4;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];
        p05 = sp0[chan1]; p15 = sp1[chan1];

        buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);

        sp0 += chan2;
        sp1 += chan2;
        /* as in the first pass, dp is only advanced here, never written through */
        dp += chan2;
      }

      /* odd width: one remaining pixel */
      if (wid & 1) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;
        p03 = p05; p13 = p15;

        p04 = sp0[0];     p14 = sp1[0];

        buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
      }

      /*
       *  3 loop: kernel row 4 applied to source row j + 4; the completed sum
       *  is shifted right by shift2 and stored to the destination
       */
      dp = dl;
      sp0 = sl + 4*sll;

      k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];

      p02 = sp0[0];
      p03 = sp0[chan1];
      p04 = sp0[chan2];
      p05 = sp0[chan3];

      /* chan2 + chan2 == chan4: step past the four pixels just loaded */
      sp0 += chan2 + chan2;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p01 = p03; p02 = p04; p03 = p05;

        p04 = sp0[0]; p05 = sp0[chan1];

        pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
                p03 * k3 + p04 * k4) >> shift2;
        pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
                p04 * k3 + p05 * k4) >> shift2;

        /* CLAMP_STORE is defined elsewhere; presumably saturates to DTYPE */
        CLAMP_STORE(dp[0],     pix0);
        CLAMP_STORE(dp[chan1], pix1);

        dp  += chan2;
        sp0 += chan2;
      }

      /* odd width: one remaining pixel */
      if (wid & 1) {
        p00 = p02; p01 = p03; p02 = p04; p03 = p05;

        p04 = sp0[0];

        pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
                p03 * k3 + p04 * k4) >> shift2;
        CLAMP_STORE(dp[0],     pix0);
      }

      /* next line */
      sl += sll;
      dl += dll;
    }
  }

  /* release the line buffer only if it was heap-allocated */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
1221 
1222 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1223 
1224 /***************************************************************/
1225 #if IMG_TYPE == 1
1226 
1227 #undef  KSIZE
1228 #define KSIZE 7
1229 
/*
 * CONV_FUNC(7x7): 7x7 convolution in FTYPE arithmetic.  The function sits
 * inside "#if IMG_TYPE == 1", so it is only built for that image type.
 *
 * Working storage (all carved out of pbuff, which is (KSIZE + 3)*wid FTYPE
 * elements long):
 *   buffs[0..KSIZE]  - KSIZE + 1 line buffers of wid elements; KSIZE of them
 *                      hold the source lines under the kernel, the extra one
 *                      receives the next source line.  The pointer table is
 *                      stored twice so that buffs + buff_ind can be indexed
 *                      0..KSIZE without wrapping.
 *   buffd            - per-pixel accumulator for the current output line
 *   buffo / buffi    - mlib_s32 scratch placed after buffd
 *
 * For every output line the KSIZE kernel rows are applied one after another.
 * Rows 0..KSIZE-2 only accumulate into buffd; the last row adds its terms,
 * converts with D2I/FROM_S32, stores two pixels per iteration, clears buffd
 * and at the same time loads the next source line into buffn.
 *
 * Parameters:
 *   dst, src     - destination and source images
 *   kern, scalef_expon
 *                - kernel and its scale; consumed by LOAD_KERNEL (defined
 *                  elsewhere), which presumably fills k[] - TODO confirm
 *   cmask        - channel mask; channel c is processed only when bit
 *                  (nchannel - 1 - c) is set
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the work buffer cannot be
 * allocated.
 */
mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_s32   *kern,
                           mlib_s32         scalef_expon,
                           mlib_s32         cmask)
{
  FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
  FTYPE    k[KSIZE*KSIZE];
  mlib_s32 l, m, buff_ind;
  mlib_s32 d0, d1;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
  DEF_VARS(DTYPE);
  DTYPE *sl1;
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* the on-stack buffer is sized for lines of at most BUFF_LINE pixels */
  if (wid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* line buffers, then the same pointers again for wrap-free indexing */
  for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
  buffd = buffs[KSIZE] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* from here on wid/hgt describe the written (interior) region */
  wid -= (KSIZE - 1);
  hgt -= (KSIZE - 1);

  /* first written pixel sits (KSIZE - 1)/2 rows and pixels inside dst */
  adr_dst += ((KSIZE - 1)/2)*(dll + chan1);

  for (c = 0; c < nchannel; c++) {
    /* skip channels that are not selected in cmask */
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    sl1 = sl  + sll;
    sl2 = sl1 + sll;
    sl3 = sl2 + sll;
    sl4 = sl3 + sll;
    sl5 = sl4 + sll;
    sl6 = sl5 + sll;
    /* preload the first KSIZE source lines of this channel, full width */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid + (KSIZE - 1); i++) {
      buffs[0][i] = (FTYPE)sl[i*chan1];
      buffs[1][i] = (FTYPE)sl1[i*chan1];
      buffs[2][i] = (FTYPE)sl2[i*chan1];
      buffs[3][i] = (FTYPE)sl3[i*chan1];
      buffs[4][i] = (FTYPE)sl4[i*chan1];
      buffs[5][i] = (FTYPE)sl5[i*chan1];
      buffs[6][i] = (FTYPE)sl6[i*chan1];
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    /* sl now addresses the first source line that is not yet buffered */
    sl += KSIZE*sll;

    for (j = 0; j < hgt; j++) {
      FTYPE    **buffc = buffs + buff_ind;  /* the KSIZE lines under the kernel */
      FTYPE    *buffn = buffc[KSIZE];       /* buffer that receives the next line */
      FTYPE    *pk = k;

      for (l = 0; l < KSIZE; l++) {
        FTYPE    *buff = buffc[l];          /* shadows the function-level buff */
        d64_2x32 dd;

        sp = sl;
        dp = dl;

        /* prime the sliding window with the first six samples of the line */
        p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
        p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

        /* coefficients of kernel row l */
        k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
        k4 = *pk++; k5 = *pk++; k6 = *pk++;

        if (l < (KSIZE - 1)) {
          /* not the last kernel row: accumulate only */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
            buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
          }

        } else {
          /* last kernel row: finish the sums, store, and fetch the next line */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            /* LOAD_BUFF is defined elsewhere; presumably it places the next
               two source pixels in buffi[i], buffi[i + 1] - TODO confirm */
            LOAD_BUFF(buffi);

            /* read both 32-bit values with one 64-bit load, then widen them */
            dd.d64 = *(FTYPE   *)(buffi + i);
            buffn[i    ] = (FTYPE)dd.i32s.i0;
            buffn[i + 1] = (FTYPE)dd.i32s.i1;

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

            dp[0    ] = FROM_S32(d0);
            dp[chan1] = FROM_S32(d1);

            /* reset the accumulators for the next output line */
            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            sp += chan2;
            dp += chan2;
          }
        }
      }

      /* last pixels: the odd leftover column is computed directly from the
         KSIZE buffered lines; sp/dp continue from the loop above */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 d0;

        for (l = 0; l < KSIZE; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* copy the KSIZE - 1 trailing pixels of the next source line */
      for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      /* rotate the ring of line buffers */
      buff_ind++;

      if (buff_ind >= KSIZE + 1) buff_ind = 0;
    }
  }

  /* release the work buffer only if it was heap-allocated */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
1401 
1402 #endif /* IMG_TYPE == 1 */
1403 
1404 /***************************************************************/
/* MAX_KER: widest slice of a kernel row that CONV_FUNC(MxN) processes in a
   single unrolled pass; wider rows are split into several slices. */
#define MAX_KER   7
/* MAX_N: tallest kernel for which CONV_FUNC(MxN) can keep its table of line
   buffer pointers on the stack (the table has 2*(MAX_N + 1) entries). */
#define MAX_N    15
1407 
mlib_ImageConv1xN(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dn,mlib_s32 cmask)1408 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
1409                                      const mlib_image *src,
1410                                      const mlib_d64   *k,
1411                                      mlib_s32         n,
1412                                      mlib_s32         dn,
1413                                      mlib_s32         cmask)
1414 {
1415   FTYPE    buff[BUFF_SIZE];
1416   mlib_s32 off, kh;
1417   mlib_s32 d0, d1;
1418   const FTYPE    *pk;
1419   FTYPE    k0, k1, k2, k3;
1420   FTYPE    p0, p1, p2, p3, p4;
1421   DEF_VARS(DTYPE);
1422   DTYPE    *sl_c, *dl_c, *sl0;
1423   mlib_s32 l, hsize, max_hsize;
1424   GET_SRC_DST_PARAMETERS(DTYPE);
1425 
1426   hgt -= (n - 1);
1427   adr_dst += dn*dll;
1428 
1429   max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1430 
1431   if (!max_hsize) max_hsize = 1;
1432 
1433   if (max_hsize > BUFF_SIZE) {
1434     pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1435   }
1436 
1437   chan1 = nchannel;
1438 
1439   sl_c = adr_src;
1440   dl_c = adr_dst;
1441 
1442   for (l = 0; l < hgt; l += hsize) {
1443     hsize = hgt - l;
1444 
1445     if (hsize > max_hsize) hsize = max_hsize;
1446 
1447     for (c = 0; c < nchannel; c++) {
1448       if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1449 
1450       sl = sl_c + c;
1451       dl = dl_c + c;
1452 
1453 #ifdef __SUNPRO_C
1454 #pragma pipeloop(0)
1455 #endif /* __SUNPRO_C */
1456       for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1457 
1458       for (i = 0; i < wid; i++) {
1459         sl0 = sl;
1460 
1461         for (off = 0; off < (n - 4); off += 4) {
1462           pk = k + off;
1463           sp = sl0;
1464 
1465           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1466           p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1467           sp += 3*sll;
1468 
1469 #ifdef __SUNPRO_C
1470 #pragma pipeloop(0)
1471 #endif /* __SUNPRO_C */
1472           for (j = 0; j < hsize; j += 2) {
1473             p0 = p2; p1 = p3; p2 = p4;
1474             p3 = sp[0];
1475             p4 = sp[sll];
1476 
1477             pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1478             pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1479 
1480             sp += 2*sll;
1481           }
1482 
1483           sl0 += 4*sll;
1484         }
1485 
1486         pk = k + off;
1487         sp = sl0;
1488 
1489         k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1490         p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1491 
1492         dp = dl;
1493         kh = n - off;
1494 
1495         if (kh == 4) {
1496           sp += 3*sll;
1497 
1498 #ifdef __SUNPRO_C
1499 #pragma pipeloop(0)
1500 #endif /* __SUNPRO_C */
1501           for (j = 0; j <= (hsize - 2); j += 2) {
1502             p0 = p2; p1 = p3; p2 = p4;
1503             p3 = sp[0];
1504             p4 = sp[sll];
1505 
1506             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1507             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1508 
1509             dp[0  ] = FROM_S32(d0);
1510             dp[dll] = FROM_S32(d1);
1511 
1512             pbuff[j] = 0;
1513             pbuff[j + 1] = 0;
1514 
1515             sp += 2*sll;
1516             dp += 2*dll;
1517           }
1518 
1519           if (j < hsize) {
1520             p0 = p2; p1 = p3; p2 = p4;
1521             p3 = sp[0];
1522 
1523             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1524 
1525             pbuff[j] = 0;
1526 
1527             dp[0] = FROM_S32(d0);
1528           }
1529 
1530         } else if (kh == 3) {
1531           sp += 2*sll;
1532 
1533 #ifdef __SUNPRO_C
1534 #pragma pipeloop(0)
1535 #endif /* __SUNPRO_C */
1536           for (j = 0; j <= (hsize - 2); j += 2) {
1537             p0 = p2; p1 = p3;
1538             p2 = sp[0];
1539             p3 = sp[sll];
1540 
1541             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1542             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1543 
1544             dp[0  ] = FROM_S32(d0);
1545             dp[dll] = FROM_S32(d1);
1546 
1547             pbuff[j] = 0;
1548             pbuff[j + 1] = 0;
1549 
1550             sp += 2*sll;
1551             dp += 2*dll;
1552           }
1553 
1554           if (j < hsize) {
1555             p0 = p2; p1 = p3;
1556             p2 = sp[0];
1557 
1558             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1559 
1560             pbuff[j] = 0;
1561 
1562             dp[0] = FROM_S32(d0);
1563           }
1564 
1565         } else if (kh == 2) {
1566           sp += sll;
1567 
1568 #ifdef __SUNPRO_C
1569 #pragma pipeloop(0)
1570 #endif /* __SUNPRO_C */
1571           for (j = 0; j <= (hsize - 2); j += 2) {
1572             p0 = p2;
1573             p1 = sp[0];
1574             p2 = sp[sll];
1575 
1576             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1577             d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1578 
1579             dp[0  ] = FROM_S32(d0);
1580             dp[dll] = FROM_S32(d1);
1581 
1582             pbuff[j] = 0;
1583             pbuff[j + 1] = 0;
1584 
1585             sp += 2*sll;
1586             dp += 2*dll;
1587           }
1588 
1589           if (j < hsize) {
1590             p0 = p2;
1591             p1 = sp[0];
1592 
1593             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1594 
1595             pbuff[j] = 0;
1596 
1597             dp[0] = FROM_S32(d0);
1598           }
1599 
1600         } else /* if (kh == 1) */ {
1601 #ifdef __SUNPRO_C
1602 #pragma pipeloop(0)
1603 #endif /* __SUNPRO_C */
1604           for (j = 0; j < hsize; j++) {
1605             p0 = sp[0];
1606 
1607             d0 = D2I(p0*k0 + pbuff[j]);
1608 
1609             dp[0] = FROM_S32(d0);
1610 
1611             pbuff[j] = 0;
1612 
1613             sp += sll;
1614             dp += dll;
1615           }
1616         }
1617 
1618         sl += chan1;
1619         dl += chan1;
1620       }
1621     }
1622 
1623     sl_c += max_hsize*sll;
1624     dl_c += max_hsize*dll;
1625   }
1626 
1627   if (pbuff != buff) mlib_free(pbuff);
1628 
1629   return MLIB_SUCCESS;
1630 }
1631 
1632 /***************************************************************/
CONV_FUNC(MxN)1633 mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
1634                            const mlib_image *src,
1635                            const mlib_s32   *kernel,
1636                            mlib_s32         m,
1637                            mlib_s32         n,
1638                            mlib_s32         dm,
1639                            mlib_s32         dn,
1640                            mlib_s32         scale,
1641                            mlib_s32         cmask)
1642 {
1643   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1644   FTYPE    **buffs = buffs_arr, *buffd;
1645   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
1646   mlib_s32 mn, l, off, kw, bsize, buff_ind;
1647   mlib_s32 d0, d1;
1648   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1649   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1650   d64_2x32 dd;
1651   DEF_VARS(DTYPE);
1652   mlib_s32 chan2;
1653   mlib_s32 *buffo, *buffi;
1654   mlib_status status = MLIB_SUCCESS;
1655 
1656   GET_SRC_DST_PARAMETERS(DTYPE);
1657 
1658   if (scale > 30) {
1659     fscale *= 1.0/(1 << 30);
1660     scale -= 30;
1661   }
1662 
1663   fscale /= (1 << scale);
1664 
1665   mn = m*n;
1666 
1667   if (mn > 256) {
1668     k = mlib_malloc(mn*sizeof(mlib_d64));
1669 
1670     if (k == NULL) return MLIB_FAILURE;
1671   }
1672 
1673   for (i = 0; i < mn; i++) {
1674     k[i] = kernel[i]*fscale;
1675   }
1676 
1677   if (m == 1) {
1678     status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1679     FREE_AND_RETURN_STATUS;
1680   }
1681 
1682   bsize = (n + 3)*wid;
1683 
1684   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1685     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1686 
1687     if (pbuff == NULL) {
1688       status = MLIB_FAILURE;
1689       FREE_AND_RETURN_STATUS;
1690     }
1691     buffs = (FTYPE   **)(pbuff + bsize);
1692   }
1693 
1694   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1695   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1696   buffd = buffs[n] + wid;
1697   buffo = (mlib_s32*)(buffd + wid);
1698   buffi = buffo + (wid &~ 1);
1699 
1700   chan1 = nchannel;
1701   chan2 = chan1 + chan1;
1702 
1703   wid -= (m - 1);
1704   hgt -= (n - 1);
1705   adr_dst += dn*dll + dm*nchannel;
1706 
1707   for (c = 0; c < nchannel; c++) {
1708     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1709 
1710     sl = adr_src + c;
1711     dl = adr_dst + c;
1712 
1713     for (l = 0; l < n; l++) {
1714       FTYPE    *buff = buffs[l];
1715 
1716 #ifdef __SUNPRO_C
1717 #pragma pipeloop(0)
1718 #endif /* __SUNPRO_C */
1719       for (i = 0; i < wid + (m - 1); i++) {
1720         buff[i] = (FTYPE)sl[i*chan1];
1721       }
1722 
1723       sl += sll;
1724     }
1725 
1726     buff_ind = 0;
1727 
1728 #ifdef __SUNPRO_C
1729 #pragma pipeloop(0)
1730 #endif /* __SUNPRO_C */
1731     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1732 
1733     for (j = 0; j < hgt; j++) {
1734       FTYPE    **buffc = buffs + buff_ind;
1735       FTYPE    *buffn = buffc[n];
1736       FTYPE    *pk = k;
1737 
1738       for (l = 0; l < n; l++) {
1739         FTYPE    *buff_l = buffc[l];
1740 
1741         for (off = 0; off < m;) {
1742           FTYPE    *buff = buff_l + off;
1743 
1744           kw = m - off;
1745 
1746           if (kw > 2*MAX_KER) kw = MAX_KER; else
1747             if (kw > MAX_KER) kw = kw/2;
1748           off += kw;
1749 
1750           sp = sl;
1751           dp = dl;
1752 
1753           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1754           p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1755 
1756           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1757           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1758           pk += kw;
1759 
1760           if (kw == 7) {
1761 
1762             if (l < (n - 1) || off < m) {
1763 #ifdef __SUNPRO_C
1764 #pragma pipeloop(0)
1765 #endif /* __SUNPRO_C */
1766               for (i = 0; i <= (wid - 2); i += 2) {
1767                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1768 
1769                 p6 = buff[i + 6]; p7 = buff[i + 7];
1770 
1771                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1772                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1773               }
1774 
1775             } else {
1776 #ifdef __SUNPRO_C
1777 #pragma pipeloop(0)
1778 #endif /* __SUNPRO_C */
1779               for (i = 0; i <= (wid - 2); i += 2) {
1780                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1781 
1782                 p6 = buff[i + 6]; p7 = buff[i + 7];
1783 
1784                 LOAD_BUFF(buffi);
1785 
1786                 dd.d64 = *(FTYPE   *)(buffi + i);
1787                 buffn[i    ] = (FTYPE)dd.i32s.i0;
1788                 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1789 
1790                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1791                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1792 
1793                 dp[0    ] = FROM_S32(d0);
1794                 dp[chan1] = FROM_S32(d1);
1795 
1796                 buffd[i    ] = 0.0;
1797                 buffd[i + 1] = 0.0;
1798 
1799                 sp += chan2;
1800                 dp += chan2;
1801               }
1802             }
1803 
1804           } else if (kw == 6) {
1805 
1806             if (l < (n - 1) || off < m) {
1807 #ifdef __SUNPRO_C
1808 #pragma pipeloop(0)
1809 #endif /* __SUNPRO_C */
1810               for (i = 0; i <= (wid - 2); i += 2) {
1811                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1812 
1813                 p5 = buff[i + 5]; p6 = buff[i + 6];
1814 
1815                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1816                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1817               }
1818 
1819             } else {
1820 #ifdef __SUNPRO_C
1821 #pragma pipeloop(0)
1822 #endif /* __SUNPRO_C */
1823               for (i = 0; i <= (wid - 2); i += 2) {
1824                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1825 
1826                 p5 = buff[i + 5]; p6 = buff[i + 6];
1827 
1828                 buffn[i    ] = (FTYPE)sp[0];
1829                 buffn[i + 1] = (FTYPE)sp[chan1];
1830 
1831                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1832                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1833 
1834                 dp[0    ] = FROM_S32(d0);
1835                 dp[chan1] = FROM_S32(d1);
1836 
1837                 buffd[i    ] = 0.0;
1838                 buffd[i + 1] = 0.0;
1839 
1840                 sp += chan2;
1841                 dp += chan2;
1842               }
1843             }
1844 
1845           } else if (kw == 5) {
1846 
1847             if (l < (n - 1) || off < m) {
1848 #ifdef __SUNPRO_C
1849 #pragma pipeloop(0)
1850 #endif /* __SUNPRO_C */
1851               for (i = 0; i <= (wid - 2); i += 2) {
1852                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1853 
1854                 p4 = buff[i + 4]; p5 = buff[i + 5];
1855 
1856                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1857                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1858               }
1859 
1860             } else {
1861 #ifdef __SUNPRO_C
1862 #pragma pipeloop(0)
1863 #endif /* __SUNPRO_C */
1864               for (i = 0; i <= (wid - 2); i += 2) {
1865                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1866 
1867                 p4 = buff[i + 4]; p5 = buff[i + 5];
1868 
1869                 buffn[i    ] = (FTYPE)sp[0];
1870                 buffn[i + 1] = (FTYPE)sp[chan1];
1871 
1872                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1873                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1874 
1875                 dp[0    ] = FROM_S32(d0);
1876                 dp[chan1] = FROM_S32(d1);
1877 
1878                 buffd[i    ] = 0.0;
1879                 buffd[i + 1] = 0.0;
1880 
1881                 sp += chan2;
1882                 dp += chan2;
1883               }
1884             }
1885 
1886           } else if (kw == 4) {
1887 
1888             if (l < (n - 1) || off < m) {
1889 #ifdef __SUNPRO_C
1890 #pragma pipeloop(0)
1891 #endif /* __SUNPRO_C */
1892               for (i = 0; i <= (wid - 2); i += 2) {
1893                 p0 = p2; p1 = p3; p2 = p4;
1894 
1895                 p3 = buff[i + 3]; p4 = buff[i + 4];
1896 
1897                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1898                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1899               }
1900 
1901             } else {
1902 #ifdef __SUNPRO_C
1903 #pragma pipeloop(0)
1904 #endif /* __SUNPRO_C */
1905               for (i = 0; i <= (wid - 2); i += 2) {
1906                 p0 = p2; p1 = p3; p2 = p4;
1907 
1908                 p3 = buff[i + 3]; p4 = buff[i + 4];
1909 
1910                 buffn[i    ] = (FTYPE)sp[0];
1911                 buffn[i + 1] = (FTYPE)sp[chan1];
1912 
1913                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1914                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1915 
1916                 dp[0    ] = FROM_S32(d0);
1917                 dp[chan1] = FROM_S32(d1);
1918 
1919                 buffd[i    ] = 0.0;
1920                 buffd[i + 1] = 0.0;
1921 
1922                 sp += chan2;
1923                 dp += chan2;
1924               }
1925             }
1926 
1927           } else if (kw == 3) {
1928 
1929             if (l < (n - 1) || off < m) {
1930 #ifdef __SUNPRO_C
1931 #pragma pipeloop(0)
1932 #endif /* __SUNPRO_C */
1933               for (i = 0; i <= (wid - 2); i += 2) {
1934                 p0 = p2; p1 = p3;
1935 
1936                 p2 = buff[i + 2]; p3 = buff[i + 3];
1937 
1938                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1939                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1940               }
1941 
1942             } else {
1943 #ifdef __SUNPRO_C
1944 #pragma pipeloop(0)
1945 #endif /* __SUNPRO_C */
1946               for (i = 0; i <= (wid - 2); i += 2) {
1947                 p0 = p2; p1 = p3;
1948 
1949                 p2 = buff[i + 2]; p3 = buff[i + 3];
1950 
1951                 buffn[i    ] = (FTYPE)sp[0];
1952                 buffn[i + 1] = (FTYPE)sp[chan1];
1953 
1954                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1955                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1956 
1957                 dp[0    ] = FROM_S32(d0);
1958                 dp[chan1] = FROM_S32(d1);
1959 
1960                 buffd[i    ] = 0.0;
1961                 buffd[i + 1] = 0.0;
1962 
1963                 sp += chan2;
1964                 dp += chan2;
1965               }
1966             }
1967 
1968           } else /*if (kw == 2)*/ {
1969 
1970             if (l < (n - 1) || off < m) {
1971 #ifdef __SUNPRO_C
1972 #pragma pipeloop(0)
1973 #endif /* __SUNPRO_C */
1974               for (i = 0; i <= (wid - 2); i += 2) {
1975                 p0 = p2;
1976 
1977                 p1 = buff[i + 1]; p2 = buff[i + 2];
1978 
1979                 buffd[i    ] += p0*k0 + p1*k1;
1980                 buffd[i + 1] += p1*k0 + p2*k1;
1981               }
1982 
1983             } else {
1984 #ifdef __SUNPRO_C
1985 #pragma pipeloop(0)
1986 #endif /* __SUNPRO_C */
1987               for (i = 0; i <= (wid - 2); i += 2) {
1988                 p0 = p2;
1989 
1990                 p1 = buff[i + 1]; p2 = buff[i + 2];
1991 
1992                 buffn[i    ] = (FTYPE)sp[0];
1993                 buffn[i + 1] = (FTYPE)sp[chan1];
1994 
1995                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
1996                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1997 
1998                 dp[0    ] = FROM_S32(d0);
1999                 dp[chan1] = FROM_S32(d1);
2000 
2001                 buffd[i    ] = 0.0;
2002                 buffd[i + 1] = 0.0;
2003 
2004                 sp += chan2;
2005                 dp += chan2;
2006               }
2007             }
2008           }
2009         }
2010       }
2011 
2012       /* last pixels */
2013       for (; i < wid; i++) {
2014         FTYPE    *pk = k, s = 0;
2015         mlib_s32 x, d0;
2016 
2017         for (l = 0; l < n; l++) {
2018           FTYPE    *buff = buffc[l] + i;
2019 
2020           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2021         }
2022 
2023         d0 = D2I(s);
2024         dp[0] = FROM_S32(d0);
2025 
2026         buffn[i] = (FTYPE)sp[0];
2027 
2028         sp += chan1;
2029         dp += chan1;
2030       }
2031 
2032       for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2033 
2034       /* next line */
2035       sl += sll;
2036       dl += dll;
2037 
2038       buff_ind++;
2039 
2040       if (buff_ind >= n + 1) buff_ind = 0;
2041     }
2042   }
2043 
2044   FREE_AND_RETURN_STATUS;
2045 }
2046 
2047 /***************************************************************/
2048 #ifndef __sparc /* for x86, using integer multiplies is faster */
2049 
/*
 * Scale an accumulated integer sum and store it.
 *
 * Shifts `x` right by `shift2` IN PLACE (so `x` must be a modifiable lvalue
 * and a variable named `shift2` must be in scope where the macro expands),
 * then passes the shifted value to CLAMP_STORE (defined elsewhere) to write
 * it into `res`.
 *
 * Wrapped in do { } while (0) so that the two steps always behave as one
 * statement, e.g. under an unbraced `if` / `else`.
 */
#define STORE_RES(res, x)                                       \
  do {                                                          \
    (x) >>= shift2;                                             \
    CLAMP_STORE(res, x);                                        \
  } while (0)
2053 
CONV_FUNC_I(MxN)2054 mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
2055                              const mlib_image *src,
2056                              const mlib_s32   *kernel,
2057                              mlib_s32         m,
2058                              mlib_s32         n,
2059                              mlib_s32         dm,
2060                              mlib_s32         dn,
2061                              mlib_s32         scale,
2062                              mlib_s32         cmask)
2063 {
2064   mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2065   mlib_s32 l, off, kw;
2066   mlib_s32 d0, d1, shift1, shift2;
2067   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2068   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2069   DTYPE    *adr_src, *sl, *sp = NULL;
2070   DTYPE    *adr_dst, *dl, *dp = NULL;
2071   mlib_s32 wid, hgt, sll, dll;
2072   mlib_s32 nchannel, chan1;
2073   mlib_s32 i, j, c;
2074   mlib_s32 chan2;
2075   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2076   GET_SRC_DST_PARAMETERS(DTYPE);
2077 
2078 #if IMG_TYPE != 1
2079   shift1 = 16;
2080 #else
2081   shift1 = 8;
2082 #endif /* IMG_TYPE != 1 */
2083   shift2 = scale - shift1;
2084 
2085   chan1 = nchannel;
2086   chan2 = chan1 + chan1;
2087 
2088   wid -= (m - 1);
2089   hgt -= (n - 1);
2090   adr_dst += dn*dll + dm*nchannel;
2091 
2092   if (wid > BUFF_SIZE) {
2093     buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2094 
2095     if (buffd == NULL) return MLIB_FAILURE;
2096   }
2097 
2098   if (m*n > MAX_N*MAX_N) {
2099     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2100 
2101     if (k == NULL) {
2102       if (buffd != buff) mlib_free(buffd);
2103       return MLIB_FAILURE;
2104     }
2105   }
2106 
2107   for (i = 0; i < m*n; i++) {
2108     k[i] = kernel[i] >> shift1;
2109   }
2110 
2111   for (c = 0; c < nchannel; c++) {
2112     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2113 
2114     sl = adr_src + c;
2115     dl = adr_dst + c;
2116 
2117 #ifdef __SUNPRO_C
2118 #pragma pipeloop(0)
2119 #endif /* __SUNPRO_C */
2120     for (i = 0; i < wid; i++) buffd[i] = 0;
2121 
2122     for (j = 0; j < hgt; j++) {
2123       mlib_s32 *pk = k;
2124 
2125       for (l = 0; l < n; l++) {
2126         DTYPE *sp0 = sl + l*sll;
2127 
2128         for (off = 0; off < m;) {
2129           sp = sp0 + off*chan1;
2130           dp = dl;
2131 
2132           kw = m - off;
2133 
2134           if (kw > 2*MAX_KER) kw = MAX_KER; else
2135             if (kw > MAX_KER) kw = kw/2;
2136           off += kw;
2137 
2138           p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2139           p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2140 
2141           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2142           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2143           pk += kw;
2144 
2145           sp += (kw - 1)*chan1;
2146 
2147           if (kw == 7) {
2148 
2149             if (l < (n - 1) || off < m) {
2150 #ifdef __SUNPRO_C
2151 #pragma pipeloop(0)
2152 #endif /* __SUNPRO_C */
2153               for (i = 0; i <= (wid - 2); i += 2) {
2154                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2155                 p6 = sp[0];
2156                 p7 = sp[chan1];
2157 
2158                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2159                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2160 
2161                 sp += chan2;
2162               }
2163 
2164             } else {
2165 #ifdef __SUNPRO_C
2166 #pragma pipeloop(0)
2167 #endif /* __SUNPRO_C */
2168               for (i = 0; i <= (wid - 2); i += 2) {
2169                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2170                 p6 = sp[0];
2171                 p7 = sp[chan1];
2172 
2173                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
2174                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2175 
2176                 STORE_RES(dp[0    ], d0);
2177                 STORE_RES(dp[chan1], d1);
2178 
2179                 buffd[i    ] = 0;
2180                 buffd[i + 1] = 0;
2181 
2182                 sp += chan2;
2183                 dp += chan2;
2184               }
2185             }
2186 
2187           } else if (kw == 6) {
2188 
2189             if (l < (n - 1) || off < m) {
2190 #ifdef __SUNPRO_C
2191 #pragma pipeloop(0)
2192 #endif /* __SUNPRO_C */
2193               for (i = 0; i <= (wid - 2); i += 2) {
2194                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2195                 p5 = sp[0];
2196                 p6 = sp[chan1];
2197 
2198                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2199                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2200 
2201                 sp += chan2;
2202               }
2203 
2204             } else {
2205 #ifdef __SUNPRO_C
2206 #pragma pipeloop(0)
2207 #endif /* __SUNPRO_C */
2208               for (i = 0; i <= (wid - 2); i += 2) {
2209                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2210                 p5 = sp[0];
2211                 p6 = sp[chan1];
2212 
2213                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
2214                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2215 
2216                 STORE_RES(dp[0    ], d0);
2217                 STORE_RES(dp[chan1], d1);
2218 
2219                 buffd[i    ] = 0;
2220                 buffd[i + 1] = 0;
2221 
2222                 sp += chan2;
2223                 dp += chan2;
2224               }
2225             }
2226 
2227           } else if (kw == 5) {
2228 
2229             if (l < (n - 1) || off < m) {
2230 #ifdef __SUNPRO_C
2231 #pragma pipeloop(0)
2232 #endif /* __SUNPRO_C */
2233               for (i = 0; i <= (wid - 2); i += 2) {
2234                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2235                 p4 = sp[0];
2236                 p5 = sp[chan1];
2237 
2238                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2239                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2240 
2241                 sp += chan2;
2242               }
2243 
2244             } else {
2245 #ifdef __SUNPRO_C
2246 #pragma pipeloop(0)
2247 #endif /* __SUNPRO_C */
2248               for (i = 0; i <= (wid - 2); i += 2) {
2249                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2250                 p4 = sp[0];
2251                 p5 = sp[chan1];
2252 
2253                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
2254                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2255 
2256                 STORE_RES(dp[0    ], d0);
2257                 STORE_RES(dp[chan1], d1);
2258 
2259                 buffd[i    ] = 0;
2260                 buffd[i + 1] = 0;
2261 
2262                 sp += chan2;
2263                 dp += chan2;
2264               }
2265             }
2266 
2267           } else if (kw == 4) {
2268 
2269             if (l < (n - 1) || off < m) {
2270 #ifdef __SUNPRO_C
2271 #pragma pipeloop(0)
2272 #endif /* __SUNPRO_C */
2273               for (i = 0; i <= (wid - 2); i += 2) {
2274                 p0 = p2; p1 = p3; p2 = p4;
2275                 p3 = sp[0];
2276                 p4 = sp[chan1];
2277 
2278                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2279                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2280 
2281                 sp += chan2;
2282               }
2283 
2284             } else {
2285 #ifdef __SUNPRO_C
2286 #pragma pipeloop(0)
2287 #endif /* __SUNPRO_C */
2288               for (i = 0; i <= (wid - 2); i += 2) {
2289                 p0 = p2; p1 = p3; p2 = p4;
2290                 p3 = sp[0];
2291                 p4 = sp[chan1];
2292 
2293                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
2294                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2295 
2296                 STORE_RES(dp[0    ], d0);
2297                 STORE_RES(dp[chan1], d1);
2298 
2299                 buffd[i    ] = 0;
2300                 buffd[i + 1] = 0;
2301 
2302                 sp += chan2;
2303                 dp += chan2;
2304               }
2305             }
2306 
2307           } else if (kw == 3) {
2308 
2309             if (l < (n - 1) || off < m) {
2310 #ifdef __SUNPRO_C
2311 #pragma pipeloop(0)
2312 #endif /* __SUNPRO_C */
2313               for (i = 0; i <= (wid - 2); i += 2) {
2314                 p0 = p2; p1 = p3;
2315                 p2 = sp[0];
2316                 p3 = sp[chan1];
2317 
2318                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
2319                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2320 
2321                 sp += chan2;
2322               }
2323 
2324             } else {
2325 #ifdef __SUNPRO_C
2326 #pragma pipeloop(0)
2327 #endif /* __SUNPRO_C */
2328               for (i = 0; i <= (wid - 2); i += 2) {
2329                 p0 = p2; p1 = p3;
2330                 p2 = sp[0];
2331                 p3 = sp[chan1];
2332 
2333                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
2334                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2335 
2336                 STORE_RES(dp[0    ], d0);
2337                 STORE_RES(dp[chan1], d1);
2338 
2339                 buffd[i    ] = 0;
2340                 buffd[i + 1] = 0;
2341 
2342                 sp += chan2;
2343                 dp += chan2;
2344               }
2345             }
2346 
2347           } else if (kw == 2) {
2348 
2349             if (l < (n - 1) || off < m) {
2350 #ifdef __SUNPRO_C
2351 #pragma pipeloop(0)
2352 #endif /* __SUNPRO_C */
2353               for (i = 0; i <= (wid - 2); i += 2) {
2354                 p0 = p2;
2355                 p1 = sp[0];
2356                 p2 = sp[chan1];
2357 
2358                 buffd[i    ] += p0*k0 + p1*k1;
2359                 buffd[i + 1] += p1*k0 + p2*k1;
2360 
2361                 sp += chan2;
2362               }
2363 
2364             } else {
2365 #ifdef __SUNPRO_C
2366 #pragma pipeloop(0)
2367 #endif /* __SUNPRO_C */
2368               for (i = 0; i <= (wid - 2); i += 2) {
2369                 p0 = p2;
2370                 p1 = sp[0];
2371                 p2 = sp[chan1];
2372 
2373                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
2374                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2375 
2376                 STORE_RES(dp[0    ], d0);
2377                 STORE_RES(dp[chan1], d1);
2378 
2379                 buffd[i    ] = 0;
2380                 buffd[i + 1] = 0;
2381 
2382                 sp += chan2;
2383                 dp += chan2;
2384               }
2385             }
2386 
2387           } else /*if (kw == 1)*/ {
2388 
2389             if (l < (n - 1) || off < m) {
2390 #ifdef __SUNPRO_C
2391 #pragma pipeloop(0)
2392 #endif /* __SUNPRO_C */
2393               for (i = 0; i <= (wid - 2); i += 2) {
2394                 p0 = sp[0];
2395                 p1 = sp[chan1];
2396 
2397                 buffd[i    ] += p0*k0;
2398                 buffd[i + 1] += p1*k0;
2399 
2400                 sp += chan2;
2401               }
2402 
2403             } else {
2404 #ifdef __SUNPRO_C
2405 #pragma pipeloop(0)
2406 #endif /* __SUNPRO_C */
2407               for (i = 0; i <= (wid - 2); i += 2) {
2408                 p0 = sp[0];
2409                 p1 = sp[chan1];
2410 
2411                 d0 = (p0*k0 + buffd[i    ]);
2412                 d1 = (p1*k0 + buffd[i + 1]);
2413 
2414                 STORE_RES(dp[0    ], d0);
2415                 STORE_RES(dp[chan1], d1);
2416 
2417                 buffd[i    ] = 0;
2418                 buffd[i + 1] = 0;
2419 
2420                 sp += chan2;
2421                 dp += chan2;
2422               }
2423             }
2424           }
2425         }
2426       }
2427 
2428       /* last pixels */
2429       for (; i < wid; i++) {
2430         mlib_s32 *pk = k, s = 0;
2431         mlib_s32 x;
2432 
2433         for (l = 0; l < n; l++) {
2434           sp = sl + l*sll + i*chan1;
2435 
2436           for (x = 0; x < m; x++) {
2437             s += sp[0] * pk[0];
2438             sp += chan1;
2439             pk ++;
2440           }
2441         }
2442 
2443         STORE_RES(dp[0], s);
2444 
2445         sp += chan1;
2446         dp += chan1;
2447       }
2448 
2449       sl += sll;
2450       dl += dll;
2451     }
2452   }
2453 
2454   if (buffd != buff) mlib_free(buffd);
2455   if (k != k_locl) mlib_free(k);
2456 
2457   return MLIB_SUCCESS;
2458 }
2459 
2460 /***************************************************************/
2461 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
2462 
2463 /***************************************************************/
2464