1 /*
2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 /*
28  * FUNCTION
29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 types and
30  *   MLIB_EDGE_DST_NO_WRITE mask
31  */
32 
33 #include "mlib_image.h"
34 #include "mlib_ImageConv.h"
35 #include "mlib_c_ImageConv.h"
36 
/*
  IMG_TYPE selects the pixel data type this file is compiled for.
  Exactly one of the branches below is active and supplies the
  type-dependent macros used by the rest of the file:

    DTYPE        pixel element type
    CONV_FUNC    name of the floating-point routine for kernel size KERN
    CONV_FUNC_I  name of the integer-arithmetic routine for kernel size KERN
    DSCALE       starting value of the kernel scale factor (see LOAD_KERNEL*)
    FROM_S32     turns a D2I() result into the value stored in a pixel
    S64TOS32     applied to the low half when LOAD_BUFF packs two pixels
    SAT_OFF      text appended to the operand of D2I() before clamping
*/
#define IMG_TYPE 1

/***************************************************************/
#if IMG_TYPE == 1

/*
 * mlib_u8: sums are scaled by 2^24 and biased by -2^31 (SAT_OFF) before
 * clamping; FROM_S32 keeps the top 8 bits and flips bit 7 (XOR 128).
 */
#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

/*
 * mlib_s16: sums are scaled by 2^16, no bias (SAT_OFF is empty);
 * FROM_S32 keeps the top 16 bits. S64TOS32 masks to 32 bits so that a
 * sign-extended low pixel does not spill into the high half.
 */
#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

/*
 * mlib_u16: sums are scaled by 2^16 and biased by -2^31 (SAT_OFF);
 * FROM_S32 keeps the top 16 bits and flips bit 15 (XOR 0x8000).
 */
#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
74 
75 /***************************************************************/
/* NOTE(review): BUFF_SIZE and CACHE_SIZE are not referenced by the 3x3, 4x4
   and 5x5 routines below - confirm they are used further down the file. */
#define BUFF_SIZE   1600

#define CACHE_SIZE  (64*1024)

/***************************************************************/
/* Floating-point type used for the line buffers, the scaled kernel
   coefficients and the accumulated sums. */
#define FTYPE mlib_d64

#ifndef MLIB_USE_FTOI_CLAMPING

/* Saturating conversion to mlib_s32: values at or beyond MLIB_S32_MIN /
   MLIB_S32_MAX are pinned to that limit, everything else is cast.
   The argument is evaluated more than once. */
#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

/* With MLIB_USE_FTOI_CLAMPING defined the conversion is a plain cast
   (presumably the platform's float-to-int conversion saturates - confirm). */
#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/* Converts a floating-point sum to mlib_s32: SAT_OFF is pasted after the
   operand (a "- 2^31" bias for the unsigned types, nothing for s16) and
   the result is passed through CLAMP_S32. */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
96 
97 /***************************************************************/
/*
 * STORE2(res0, res1) writes two results to dp[0] and dp[chan1]; the order
 * of the two operands is swapped on little-endian builds. It expects `dp`
 * and `chan1` to be in scope.
 *
 * The two stores are wrapped in do { } while (0) so the macro expands to a
 * single statement and stays correct under an unbraced if/else or loop.
 */
#ifdef VM_LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  do {                                                          \
    dp[0    ] = (res1);                                         \
    dp[chan1] = (res0);                                         \
  } while (0)

#else

#define STORE2(res0, res1)                                      \
  do {                                                          \
    dp[0    ] = (res0);                                         \
    dp[chan1] = (res1);                                         \
  } while (0)

#endif /* VM_LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff) copies the two source pixels sp[0] and sp[chan1] into
 * buff[i] and buff[i + 1]. It expects `i`, `sp` and `chan1` to be in scope.
 *
 * Without 64-bit integers (_NO_LONGLONG) the two elements are stored one
 * by one; otherwise both are packed into one mlib_s64 store, with the
 * halves ordered so that sp[0] lands in buff[i] for either endianness.
 * S64TOS32 is applied to the half that occupies the low 32 bits.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  do {                                                          \
    (buff)[i    ] = sp[0];                                      \
    (buff)[i + 1] = sp[chan1];                                  \
  } while (0)

#else /* _NO_LONGLONG */

#ifdef VM_LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)((buff) + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* VM_LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)((buff) + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* VM_LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
133 
134 /***************************************************************/
/*
 * Overlay of one mlib_d64 with two mlib_s32 words. The convolution loops
 * use it to move a pair of 32-bit values through a single 64-bit load or
 * store (d64) and then access the halves individually (i32s.i0 / i32s.i1).
 * The f32s view has the same layout; it is not referenced by the 3x3, 4x4
 * and 5x5 routines below.
 */
typedef union {
  mlib_d64 d64;
  struct {
    mlib_s32 i0;
    mlib_s32 i1;
  } i32s;
  struct {
    mlib_s32 f0;
    mlib_s32 f1;
  } f32s;
} d64_2x32;
146 
147 /***************************************************************/
/* Image widths up to BUFF_LINE use the on-stack line buffer of the
   floating-point routines; wider images allocate one with mlib_malloc. */
#define BUFF_LINE 256

/***************************************************************/
/*
 * DEF_VARS(type) declares the locals shared by the floating-point routines:
 *   adr_src/adr_dst  - image data pointers (set by GET_SRC_DST_PARAMETERS)
 *   sl/dl, sp/dp     - current source/destination line and pixel pointers
 *   pbuff            - working buffer, initially the caller's local `buff`
 *   wid, hgt         - image width and height
 *   sll, dll         - source/destination line strides in elements of `type`
 *   nchannel, chan1  - channel count and per-pixel element step
 *   i, j, c          - column, row and channel loop counters
 * A local array named `buff` must be declared before the macro is used.
 * The expansion has no trailing semicolon.
 */
#define DEF_VARS(type)                                          \
  type     *adr_src, *sl, *sp = NULL;                           \
  type     *adr_dst, *dl, *dp = NULL;                           \
  FTYPE    *pbuff = buff;                                       \
  mlib_s32 wid, hgt, sll, dll;                                  \
  mlib_s32 nchannel, chan1;                                     \
  mlib_s32 i, j, c
158 
159 /***************************************************************/
/*
 * LOAD_KERNEL3() declares the scale factor, the nine scaled coefficients
 * k0..k8 and the pixel temporaries p00..p23 of the 3x3 routine, then fills
 * k0..k8 with kern[n] * DSCALE / 2^scalef_expon.
 *
 * The division is done in steps of at most 2^30 so that the shift count
 * passed to (1 << n) never exceeds 30. `scalef_expon` is modified.
 * NOTE(review): a negative scalef_expon would reach (1 << scalef_expon)
 * unchanged - presumably callers guarantee it is non-negative; confirm.
 * Expects `kern` and `scalef_expon` in scope; no trailing semicolon.
 */
#define LOAD_KERNEL3()                                                   \
  FTYPE    scalef = DSCALE;                                              \
  FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
  FTYPE    p00, p01, p02, p03,                                           \
           p10, p11, p12, p13,                                           \
           p20, p21, p22, p23;                                           \
                                                                         \
  while (scalef_expon > 30) {                                            \
    scalef /= (1 << 30);                                                 \
    scalef_expon -= 30;                                                  \
  }                                                                      \
                                                                         \
  scalef /= (1 << scalef_expon);                                         \
                                                                         \
  /* keep kernel in regs */                                              \
  k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
  k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
  k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]

/***************************************************************/
/*
 * LOAD_KERNEL(SIZE) computes the same scale factor as LOAD_KERNEL3() and
 * stores the SIZE scaled coefficients into the array `k`.
 * Expects `k`, `kern`, `scalef_expon` and the counter `j` in scope;
 * `scalef_expon` is modified. No trailing semicolon.
 */
#define LOAD_KERNEL(SIZE)                                       \
  FTYPE    scalef = DSCALE;                                     \
                                                                \
  while (scalef_expon > 30) {                                   \
    scalef /= (1 << 30);                                        \
    scalef_expon -= 30;                                         \
  }                                                             \
                                                                \
  scalef /= (1 << scalef_expon);                                \
                                                                \
  for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
191 
192 /***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type) reads the geometry of `src` and the data
 * pointers and strides of `src` and `dst` into the locals declared by
 * DEF_VARS (or their hand-written equivalents):
 *   hgt, wid, nchannel - height, width and channel count of src
 *   sll, dll           - line strides converted from the value returned by
 *                        mlib_ImageGetStride to elements of `type`
 *   adr_src, adr_dst   - data pointers cast to `type *`
 *
 * The element size is cast to mlib_s32 before dividing: sizeof yields an
 * unsigned size_t, and dividing a signed stride by it would convert the
 * stride to unsigned first, giving a wrong result for a negative stride.
 * No trailing semicolon.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / (mlib_s32)sizeof(type);      \
  dll = mlib_ImageGetStride(dst) / (mlib_s32)sizeof(type);      \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)
201 
202 /***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val) stores the integer `val` into `dst`, saturated to
 * the range of the pixel type selected by IMG_TYPE.
 *
 * Every variant expands to a fully braced if/else chain that ends in '}',
 * so a call may be written with or without a trailing semicolon (the
 * integer routines in this file omit it). Macro arguments are
 * parenthesized; both are evaluated more than once.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  if ((val) & 0xffffff00) {                                     \
    if ((val) < MLIB_U8_MIN) {                                  \
      (dst) = MLIB_U8_MIN;                                      \
    } else {                                                    \
      (dst) = MLIB_U8_MAX;                                      \
    }                                                           \
  } else {                                                      \
    (dst) = (mlib_u8)(val);                                     \
  }

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)                                   \
  if ((val) >= MLIB_S16_MAX) {                                  \
    (dst) = MLIB_S16_MAX;                                       \
  } else if ((val) <= MLIB_S16_MIN) {                           \
    (dst) = MLIB_S16_MIN;                                       \
  } else {                                                      \
    (dst) = (mlib_s16)(val);                                    \
  }

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)                                   \
  if ((val) >= MLIB_U16_MAX) {                                  \
    (dst) = MLIB_U16_MAX;                                       \
  } else if ((val) <= MLIB_U16_MIN) {                           \
    (dst) = MLIB_U16_MIN;                                       \
  } else {                                                      \
    (dst) = (mlib_u16)(val);                                    \
  }

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
248 
249 /***************************************************************/
250 #define KSIZE  3
251 
252 mlib_status CONV_FUNC(3x3)(mlib_image       *dst,
253                            const mlib_image *src,
254                            const mlib_s32   *kern,
255                            mlib_s32         scalef_expon,
256                            mlib_s32         cmask)
257 {
258   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
259   DEF_VARS(DTYPE);
260   DTYPE *sl1;
261   mlib_s32 chan2;
262   mlib_s32 *buffo, *buffi;
263   DTYPE *sl2;
264 #ifndef __sparc
265   mlib_s32 d0, d1;
266 #endif /* __sparc */
267   LOAD_KERNEL3();
268   GET_SRC_DST_PARAMETERS(DTYPE);
269 
270   if (wid > BUFF_LINE) {
271     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
272 
273     if (pbuff == NULL) return MLIB_FAILURE;
274   }
275 
276   buff0 = pbuff;
277   buff1 = buff0 + wid;
278   buff2 = buff1 + wid;
279   buff3 = buff2 + wid;
280   buffo = (mlib_s32*)(buff3 + wid);
281   buffi = buffo + (wid &~ 1);
282 
283   chan1 = nchannel;
284   chan2 = chan1 + chan1;
285 
286   wid -= (KSIZE - 1);
287   hgt -= (KSIZE - 1);
288 
289   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
290 
291   for (c = 0; c < nchannel; c++) {
292     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
293 
294     sl = adr_src + c;
295     dl = adr_dst + c;
296 
297     sl1 = sl  + sll;
298     sl2 = sl1 + sll;
299 #ifdef __SUNPRO_C
300 #pragma pipeloop(0)
301 #endif /* __SUNPRO_C */
302     for (i = 0; i < wid + (KSIZE - 1); i++) {
303       buff0[i] = (FTYPE)sl[i*chan1];
304       buff1[i] = (FTYPE)sl1[i*chan1];
305       buff2[i] = (FTYPE)sl2[i*chan1];
306     }
307 
308     sl += KSIZE*sll;
309 
310     for (j = 0; j < hgt; j++) {
311       FTYPE    s0, s1;
312 
313       p02 = buff0[0];
314       p12 = buff1[0];
315       p22 = buff2[0];
316 
317       p03 = buff0[1];
318       p13 = buff1[1];
319       p23 = buff2[1];
320 
321       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
322       s1 = p03 * k0 + p13 * k3 + p23 * k6;
323 
324       sp = sl;
325       dp = dl;
326 
327 #ifdef __SUNPRO_C
328 #pragma pipeloop(0)
329 #endif /* __SUNPRO_C */
330       for (i = 0; i <= (wid - 2); i += 2) {
331 #ifdef __sparc
332 #ifdef _NO_LONGLONG
333         mlib_s32 o64_1, o64_2;
334 #else /* _NO_LONGLONG */
335         mlib_s64 o64;
336 #endif /* _NO_LONGLONG */
337 #endif /* __sparc */
338         d64_2x32 dd;
339 
340         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
341         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
342 
343         LOAD_BUFF(buffi);
344 
345         dd.d64 = *(FTYPE   *)(buffi + i);
346         buff3[i    ] = (FTYPE)dd.i32s.i0;
347         buff3[i + 1] = (FTYPE)dd.i32s.i1;
348 
349 #ifndef __sparc
350         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
351         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
352 
353         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
354         s1 = p03 * k0 + p13 * k3 + p23 * k6;
355 
356         dp[0    ] = FROM_S32(d0);
357         dp[chan1] = FROM_S32(d1);
358 
359 #else /* __sparc */
360 
361         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
362         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
363         *(FTYPE   *)(buffo + i) = dd.d64;
364 
365         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
366         s1 = p03 * k0 + p13 * k3 + p23 * k6;
367 
368 #ifdef _NO_LONGLONG
369 
370         o64_1 = buffo[i];
371         o64_2 = buffo[i+1];
372 #if IMG_TYPE != 1
373         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
374 #else
375         STORE2(o64_1 >> 24, o64_2 >> 24);
376 #endif /* IMG_TYPE != 1 */
377 
378 #else /* _NO_LONGLONG */
379 
380         o64 = *(mlib_s64*)(buffo + i);
381 #if IMG_TYPE != 1
382         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
383 #else
384         STORE2(o64 >> 56, o64 >> 24);
385 #endif /* IMG_TYPE != 1 */
386 #endif /* _NO_LONGLONG */
387 #endif /* __sparc */
388 
389         sp += chan2;
390         dp += chan2;
391       }
392 
393       for (; i < wid; i++) {
394         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
395         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
396         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
397 
398         buffi[i] = (mlib_s32)sp[0];
399         buff3[i] = (FTYPE)buffi[i];
400 
401 #ifndef __sparc
402 
403         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
404                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
405 
406         dp[0] = FROM_S32(d0);
407 
408 #else  /* __sparc */
409 
410         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
411                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
412 #if IMG_TYPE != 1
413         dp[0] = FROM_S32(buffo[i]);
414 #else
415         dp[0] = buffo[i] >> 24;
416 #endif /* IMG_TYPE != 1 */
417 #endif /* __sparc */
418 
419         sp += chan1;
420         dp += chan1;
421       }
422 
423       buffi[wid] = (mlib_s32)sp[0];
424       buff3[wid] = (FTYPE)buffi[wid];
425       buffi[wid + 1] = (mlib_s32)sp[chan1];
426       buff3[wid + 1] = (FTYPE)buffi[wid + 1];
427 
428       sl += sll;
429       dl += dll;
430 
431       buffT = buff0;
432       buff0 = buff1;
433       buff1 = buff2;
434       buff2 = buff3;
435       buff3 = buffT;
436     }
437   }
438 
439 #ifdef __sparc
440 #if IMG_TYPE == 1
441   {
442     mlib_s32 amask = (1 << nchannel) - 1;
443 
444     if ((cmask & amask) != amask) {
445       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
446     } else {
447       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
448     }
449   }
450 
451 #endif /* IMG_TYPE == 1 */
452 #endif /* __sparc */
453 
454   if (pbuff != buff) mlib_free(pbuff);
455 
456   return MLIB_SUCCESS;
457 }
458 
459 /***************************************************************/
460 #ifndef __sparc /* for x86, using integer multiplies is faster */
461 
462 mlib_status CONV_FUNC_I(3x3)(mlib_image       *dst,
463                              const mlib_image *src,
464                              const mlib_s32   *kern,
465                              mlib_s32         scalef_expon,
466                              mlib_s32         cmask)
467 {
468   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2;
469   DTYPE    *adr_dst, *dl, *dp;
470   mlib_s32 wid, hgt, sll, dll;
471   mlib_s32 nchannel, chan1, chan2;
472   mlib_s32 i, j, c;
473   mlib_s32 shift1, shift2;
474   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
475   mlib_s32 p02, p03,
476            p12, p13,
477            p22, p23;
478 
479 #if IMG_TYPE != 1
480   shift1 = 16;
481 #else
482   shift1 = 8;
483 #endif /* IMG_TYPE != 1 */
484 
485   shift2 = scalef_expon - shift1;
486 
487   /* keep kernel in regs */
488   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
489   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
490   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
491 
492   GET_SRC_DST_PARAMETERS(DTYPE);
493 
494   chan1 = nchannel;
495   chan2 = chan1 + chan1;
496 
497   wid -= (KSIZE - 1);
498   hgt -= (KSIZE - 1);
499 
500   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
501 
502   for (c = 0; c < chan1; c++) {
503     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
504 
505     sl = adr_src + c;
506     dl = adr_dst + c;
507 
508     for (j = 0; j < hgt; j++) {
509       mlib_s32 s0, s1;
510       mlib_s32 pix0, pix1;
511 
512       dp  = dl;
513       sp0 = sl;
514       sp1 = sp0 + sll;
515       sp2 = sp1 + sll;
516 
517       p02 = sp0[0];
518       p12 = sp1[0];
519       p22 = sp2[0];
520 
521       p03 = sp0[chan1];
522       p13 = sp1[chan1];
523       p23 = sp2[chan1];
524 
525       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
526       s1 = p03 * k0 + p13 * k3 + p23 * k6;
527 
528       sp0 += chan2;
529       sp1 += chan2;
530       sp2 += chan2;
531 
532 #ifdef __SUNPRO_C
533 #pragma pipeloop(0)
534 #endif /* __SUNPRO_C */
535       for (i = 0; i <= (wid - 2); i += 2) {
536         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
537         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
538 
539         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
540         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
541                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
542 
543         CLAMP_STORE(dp[0],     pix0)
544         CLAMP_STORE(dp[chan1], pix1)
545 
546         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
547         s1 = p03 * k0 + p13 * k3 + p23 * k6;
548 
549         sp0 += chan2;
550         sp1 += chan2;
551         sp2 += chan2;
552         dp += chan2;
553       }
554 
555       if (wid & 1) {
556         p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
557         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
558         CLAMP_STORE(dp[0], pix0)
559       }
560 
561       sl += sll;
562       dl += dll;
563     }
564   }
565 
566   return MLIB_SUCCESS;
567 }
568 
569 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
570 
571 /***************************************************************/
572 #undef  KSIZE
573 #define KSIZE 4
574 
575 mlib_status CONV_FUNC(4x4)(mlib_image       *dst,
576                            const mlib_image *src,
577                            const mlib_s32   *kern,
578                            mlib_s32         scalef_expon,
579                            mlib_s32         cmask)
580 {
581   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
582   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
583   FTYPE    k[KSIZE*KSIZE];
584   mlib_s32 d0, d1;
585   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
586   FTYPE    p00, p01, p02, p03, p04,
587            p10, p11, p12, p13, p14,
588            p20, p21, p22, p23,
589            p30, p31, p32, p33;
590   DEF_VARS(DTYPE);
591   DTYPE *sl1;
592   mlib_s32 chan2;
593   mlib_s32 *buffo, *buffi;
594   DTYPE *sl2, *sl3;
595   LOAD_KERNEL(KSIZE*KSIZE);
596   GET_SRC_DST_PARAMETERS(DTYPE);
597 
598   if (wid > BUFF_LINE) {
599     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
600 
601     if (pbuff == NULL) return MLIB_FAILURE;
602   }
603 
604   buff0 = pbuff;
605   buff1 = buff0 + wid;
606   buff2 = buff1 + wid;
607   buff3 = buff2 + wid;
608   buff4 = buff3 + wid;
609   buffd = buff4 + wid;
610   buffo = (mlib_s32*)(buffd + wid);
611   buffi = buffo + (wid &~ 1);
612 
613   chan1 = nchannel;
614   chan2 = chan1 + chan1;
615 
616   wid -= (KSIZE - 1);
617   hgt -= (KSIZE - 1);
618 
619   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
620 
621   for (c = 0; c < nchannel; c++) {
622     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
623 
624     sl = adr_src + c;
625     dl = adr_dst + c;
626 
627     sl1 = sl  + sll;
628     sl2 = sl1 + sll;
629     sl3 = sl2 + sll;
630 #ifdef __SUNPRO_C
631 #pragma pipeloop(0)
632 #endif /* __SUNPRO_C */
633     for (i = 0; i < wid + (KSIZE - 1); i++) {
634       buff0[i] = (FTYPE)sl[i*chan1];
635       buff1[i] = (FTYPE)sl1[i*chan1];
636       buff2[i] = (FTYPE)sl2[i*chan1];
637       buff3[i] = (FTYPE)sl3[i*chan1];
638     }
639 
640     sl += KSIZE*sll;
641 
642     for (j = 0; j < hgt; j++) {
643       d64_2x32 dd;
644 
645       /*
646        *  First loop on two first lines of kernel
647        */
648       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
649       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
650 
651       sp = sl;
652       dp = dl;
653 
654       p02 = buff0[0];
655       p12 = buff1[0];
656       p03 = buff0[1];
657       p13 = buff1[1];
658       p04 = buff0[2];
659 
660 #ifdef __SUNPRO_C
661 #pragma pipeloop(0)
662 #endif /* __SUNPRO_C */
663       for (i = 0; i <= (wid - 2); i += 2) {
664         p00 = p02; p10 = p12;
665         p01 = p03; p11 = p13;
666         p02 = p04; p12 = buff1[i + 2];
667         p03 = buff0[i + 3]; p13 = buff1[i + 3];
668         p04 = buff0[i + 4]; p14 = buff1[i + 4];
669 
670         LOAD_BUFF(buffi);
671 
672         dd.d64 = *(FTYPE   *)(buffi + i);
673         buff4[i    ] = (FTYPE)dd.i32s.i0;
674         buff4[i + 1] = (FTYPE)dd.i32s.i1;
675 
676         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
677                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
678         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
679                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
680 
681         sp += chan2;
682         dp += chan2;
683       }
684 
685       /*
686        *  Second loop on two last lines of kernel
687        */
688       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
689       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
690 
691       sp = sl;
692       dp = dl;
693 
694       p02 = buff2[0];
695       p12 = buff3[0];
696       p03 = buff2[1];
697       p13 = buff3[1];
698       p04 = buff2[2];
699 
700 #ifdef __SUNPRO_C
701 #pragma pipeloop(0)
702 #endif /* __SUNPRO_C */
703       for (i = 0; i <= (wid - 2); i += 2) {
704         p00 = p02; p10 = p12;
705         p01 = p03; p11 = p13;
706         p02 = p04; p12 = buff3[i + 2];
707         p03 = buff2[i + 3]; p13 = buff3[i + 3];
708         p04 = buff2[i + 4]; p14 = buff3[i + 4];
709 
710         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
711                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
712         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
713                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
714 
715         dp[0    ] = FROM_S32(d0);
716         dp[chan1] = FROM_S32(d1);
717 
718         sp += chan2;
719         dp += chan2;
720       }
721 
722       /* last pixels */
723       for (; i < wid; i++) {
724         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
725         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
726         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
727         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
728 
729         buff4[i] = (FTYPE)sp[0];
730 
731         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
732                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
733                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
734                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
735 
736         dp[0] = FROM_S32(buffo[i]);
737 
738         sp += chan1;
739         dp += chan1;
740       }
741 
742       buff4[wid    ] = (FTYPE)sp[0];
743       buff4[wid + 1] = (FTYPE)sp[chan1];
744       buff4[wid + 2] = (FTYPE)sp[chan2];
745 
746       /* next line */
747       sl += sll;
748       dl += dll;
749 
750       buffT = buff0;
751       buff0 = buff1;
752       buff1 = buff2;
753       buff2 = buff3;
754       buff3 = buff4;
755       buff4 = buffT;
756     }
757   }
758 
759   if (pbuff != buff) mlib_free(pbuff);
760 
761   return MLIB_SUCCESS;
762 }
763 
764 /***************************************************************/
765 #undef  KSIZE
766 #define KSIZE 5
767 
768 mlib_status CONV_FUNC(5x5)(mlib_image       *dst,
769                            const mlib_image *src,
770                            const mlib_s32   *kern,
771                            mlib_s32         scalef_expon,
772                            mlib_s32         cmask)
773 {
774   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
775   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
776   FTYPE    k[KSIZE*KSIZE];
777   mlib_s32 d0, d1;
778   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
779   FTYPE    p00, p01, p02, p03, p04, p05,
780            p10, p11, p12, p13, p14, p15,
781            p20, p21, p22, p23, p24,
782            p30, p31, p32, p33, p34,
783            p40, p41, p42, p43, p44;
784   DEF_VARS(DTYPE);
785   DTYPE *sl1;
786   mlib_s32 chan2;
787   mlib_s32 *buffo, *buffi;
788   DTYPE *sl2, *sl3, *sl4;
789   LOAD_KERNEL(KSIZE*KSIZE);
790   GET_SRC_DST_PARAMETERS(DTYPE);
791 
792   if (wid > BUFF_LINE) {
793     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
794 
795     if (pbuff == NULL) return MLIB_FAILURE;
796   }
797 
798   buff0 = pbuff;
799   buff1 = buff0 + wid;
800   buff2 = buff1 + wid;
801   buff3 = buff2 + wid;
802   buff4 = buff3 + wid;
803   buff5 = buff4 + wid;
804   buffd = buff5 + wid;
805   buffo = (mlib_s32*)(buffd + wid);
806   buffi = buffo + (wid &~ 1);
807 
808   chan1 = nchannel;
809   chan2 = chan1 + chan1;
810 
811   wid -= (KSIZE - 1);
812   hgt -= (KSIZE - 1);
813 
814   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
815 
816   for (c = 0; c < nchannel; c++) {
817     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
818 
819     sl = adr_src + c;
820     dl = adr_dst + c;
821 
822     sl1 = sl  + sll;
823     sl2 = sl1 + sll;
824     sl3 = sl2 + sll;
825     sl4 = sl3 + sll;
826 #ifdef __SUNPRO_C
827 #pragma pipeloop(0)
828 #endif /* __SUNPRO_C */
829     for (i = 0; i < wid + (KSIZE - 1); i++) {
830       buff0[i] = (FTYPE)sl[i*chan1];
831       buff1[i] = (FTYPE)sl1[i*chan1];
832       buff2[i] = (FTYPE)sl2[i*chan1];
833       buff3[i] = (FTYPE)sl3[i*chan1];
834       buff4[i] = (FTYPE)sl4[i*chan1];
835     }
836 
837     sl += KSIZE*sll;
838 
839     for (j = 0; j < hgt; j++) {
840       d64_2x32 dd;
841 
842       /*
843        *  First loop
844        */
845       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
846       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
847 
848       sp = sl;
849       dp = dl;
850 
851       p02 = buff0[0];
852       p12 = buff1[0];
853       p03 = buff0[1];
854       p13 = buff1[1];
855       p04 = buff0[2];
856       p14 = buff1[2];
857 
858 #ifdef __SUNPRO_C
859 #pragma pipeloop(0)
860 #endif /* __SUNPRO_C */
861       for (i = 0; i <= (wid - 2); i += 2) {
862         p00 = p02; p10 = p12;
863         p01 = p03; p11 = p13;
864         p02 = p04; p12 = p14;
865 
866         LOAD_BUFF(buffi);
867 
868         p03 = buff0[i + 3]; p13 = buff1[i + 3];
869         p04 = buff0[i + 4]; p14 = buff1[i + 4];
870         p05 = buff0[i + 5]; p15 = buff1[i + 5];
871 
872         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
873                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
874         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
875                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
876 
877         sp += chan2;
878         dp += chan2;
879       }
880 
881       /*
882        *  Second loop
883        */
884       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
885       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
886 
887       sp = sl;
888       dp = dl;
889 
890       p02 = buff2[0];
891       p12 = buff3[0];
892       p03 = buff2[1];
893       p13 = buff3[1];
894       p04 = buff2[2];
895       p14 = buff3[2];
896 
897 #ifdef __SUNPRO_C
898 #pragma pipeloop(0)
899 #endif /* __SUNPRO_C */
900       for (i = 0; i <= (wid - 2); i += 2) {
901         p00 = p02; p10 = p12;
902         p01 = p03; p11 = p13;
903 
904         p02 = buff2[i + 2]; p12 = buff3[i + 2];
905         p03 = buff2[i + 3]; p13 = buff3[i + 3];
906         p04 = buff2[i + 4]; p14 = buff3[i + 4];
907         p05 = buff2[i + 5]; p15 = buff3[i + 5];
908 
909         dd.d64 = *(FTYPE   *)(buffi + i);
910         buff5[i    ] = (FTYPE)dd.i32s.i0;
911         buff5[i + 1] = (FTYPE)dd.i32s.i1;
912 
913         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
914                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
915         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
916                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
917 
918         sp += chan2;
919         dp += chan2;
920       }
921 
922       /*
923        *  3 loop
924        */
925       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
926 
927       sp = sl;
928       dp = dl;
929 
930       p02 = buff4[0];
931       p03 = buff4[1];
932       p04 = buff4[2];
933       p05 = buff4[3];
934 
935 #ifdef __SUNPRO_C
936 #pragma pipeloop(0)
937 #endif /* __SUNPRO_C */
938       for (i = 0; i <= (wid - 2); i += 2) {
939         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
940 
941         p04 = buff4[i + 4]; p05 = buff4[i + 5];
942 
943         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
944         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
945 
946         dp[0    ] = FROM_S32(d0);
947         dp[chan1] = FROM_S32(d1);
948 
949         sp += chan2;
950         dp += chan2;
951       }
952 
953       /* last pixels */
954       for (; i < wid; i++) {
955         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
956         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
957         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
958         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
959         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
960 
961         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
962         p43 = buff4[i + 3]; p44 = buff4[i + 4];
963 
964         buff5[i] = (FTYPE)sp[0];
965 
966         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
967                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
968                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
969                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
970                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
971 
972         dp[0] = FROM_S32(buffo[i]);
973 
974         sp += chan1;
975         dp += chan1;
976       }
977 
978       buff5[wid    ] = (FTYPE)sp[0];
979       buff5[wid + 1] = (FTYPE)sp[chan1];
980       buff5[wid + 2] = (FTYPE)sp[chan2];
981       buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
982 
983       /* next line */
984       sl += sll;
985       dl += dll;
986 
987       buffT = buff0;
988       buff0 = buff1;
989       buff1 = buff2;
990       buff2 = buff3;
991       buff3 = buff4;
992       buff4 = buff5;
993       buff5 = buffT;
994     }
995   }
996 
997   if (pbuff != buff) mlib_free(pbuff);
998 
999   return MLIB_SUCCESS;
1000 }
1001 
1002 /***************************************************************/
1003 #ifndef __sparc /* for x86, using integer multiplies is faster */
1004 
/*
 * Integer-arithmetic 5x5 convolution for DTYPE images.  Destination edge
 * pixels are not written: the processed area is shrunk by KSIZE - 1 in each
 * dimension and the destination start is moved by (KSIZE - 1)/2 rows and
 * pixels.
 *
 *   dst, src     - destination / source images; their geometry is presumably
 *                  unpacked into wid, hgt, sll, dll, nchannel, adr_src and
 *                  adr_dst by GET_SRC_DST_PARAMETERS (defined elsewhere)
 *   kern         - KSIZE*KSIZE coefficients in row-major order (k[0]..k[24]
 *                  are used, so KSIZE is expected to be 5 here)
 *   scalef_expon - scaling exponent; sums are shifted right by
 *                  scalef_expon - shift1 before being stored
 *   cmask        - channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the row accumulator cannot be
 * allocated.
 *
 * Every output row is built in three passes over the source: kernel rows
 * 0-1 initialise buffd, kernel rows 2-3 are added to it, and kernel row 4
 * is added while the result is shifted and stored through CLAMP_STORE.
 */
1005 mlib_status CONV_FUNC_I(5x5)(mlib_image       *dst,
1006                              const mlib_image *src,
1007                              const mlib_s32   *kern,
1008                              mlib_s32         scalef_expon,
1009                              mlib_s32         cmask)
1010 {
1011   mlib_s32 buff[BUFF_LINE];
1012   mlib_s32 *buffd;
1013   mlib_s32 k[KSIZE*KSIZE];
1014   mlib_s32 shift1, shift2;
1015   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1016   mlib_s32 p00, p01, p02, p03, p04, p05,
1017            p10, p11, p12, p13, p14, p15;
1018   DTYPE    *adr_src, *sl, *sp0, *sp1;
1019   DTYPE    *adr_dst, *dl, *dp;
1020   mlib_s32 *pbuff = buff;
1021   mlib_s32 wid, hgt, sll, dll;
1022   mlib_s32 nchannel, chan1, chan2, chan3, chan4;
1023   mlib_s32 i, j, c;
1024
  /* shift1 bits are dropped from every coefficient up front; the rest of
     the scaling (shift2) is applied to the accumulated sum. */
1025 #if IMG_TYPE != 1
1026   shift1 = 16;
1027 #else
1028   shift1 = 8;
1029 #endif /* IMG_TYPE != 1 */
1030
1031   shift2 = scalef_expon - shift1;
1032
1033   for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1034
1035   GET_SRC_DST_PARAMETERS(DTYPE);
1036
  /* One mlib_s32 accumulator per pixel of a row; use the heap only when the
     on-stack buffer is too small. */
1037   if (wid > BUFF_LINE) {
1038     pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1039
1040     if (pbuff == NULL) return MLIB_FAILURE;
1041   }
1042
1043   buffd = pbuff;
1044
  /* Element offsets of the next one to four pixels of the same channel. */
1045   chan1 = nchannel;
1046   chan2 = chan1 + chan1;
1047   chan3 = chan2 + chan1;
1048   chan4 = chan3 + chan1;
1049
  /* Only pixels with a complete 5x5 neighbourhood are produced. */
1050   wid -= (KSIZE - 1);
1051   hgt -= (KSIZE - 1);
1052
1053   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1054
1055   for (c = 0; c < chan1; c++) {
    /* Skip channels that are not selected in cmask. */
1056     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1057
1058     sl = adr_src + c;
1059     dl = adr_dst + c;
1060
1061     for (j = 0; j < hgt; j++) {
1062       mlib_s32 pix0, pix1;
1063       /*
1064        *  First loop
1065        */
      /* Kernel rows 0 and 1 applied to source rows j and j + 1. */
1066       sp0 = sl;
1067       sp1 = sp0 + sll;
1068       dp = dl;
1069
1070       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1071       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1072
      /* Prime the sliding window with the first four pixels of both rows. */
1073       p02 = sp0[0];     p12 = sp1[0];
1074       p03 = sp0[chan1]; p13 = sp1[chan1];
1075       p04 = sp0[chan2]; p14 = sp1[chan2];
1076       p05 = sp0[chan3]; p15 = sp1[chan3];
1077
1078       sp0 += chan4;
1079       sp1 += chan4;
1080
1081 #ifdef __SUNPRO_C
1082 #pragma pipeloop(0)
1083 #endif /* __SUNPRO_C */
      /* Two output pixels per iteration: slide the window by two and load
         two new pixels from each row. */
1084       for (i = 0; i <= (wid - 2); i += 2) {
1085         p00 = p02; p10 = p12;
1086         p01 = p03; p11 = p13;
1087         p02 = p04; p12 = p14;
1088         p03 = p05; p13 = p15;
1089
1090         p04 = sp0[0];     p14 = sp1[0];
1091         p05 = sp0[chan1]; p15 = sp1[chan1];
1092
1093         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1094                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1095         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1096                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1097
1098         sp0 += chan2;
1099         sp1 += chan2;
1100         dp += chan2;
1101       }
1102
      /* Odd width: one pixel is left over after the unrolled loop. */
1103       if (wid & 1) {
1104         p00 = p02; p10 = p12;
1105         p01 = p03; p11 = p13;
1106         p02 = p04; p12 = p14;
1107         p03 = p05; p13 = p15;
1108
1109         p04 = sp0[0];     p14 = sp1[0];
1110
1111         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1112                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1113       }
1114
1115       /*
1116        *  Second loop
1117        */
      /* Kernel rows 2 and 3 applied to source rows j + 2 and j + 3; the
         partial sums are added to buffd. */
1118       sp0 = sl + 2*sll;
1119       sp1 = sp0 + sll;
1120       dp = dl;
1121
1122       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1123       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1124
1125       p02 = sp0[0];     p12 = sp1[0];
1126       p03 = sp0[chan1]; p13 = sp1[chan1];
1127       p04 = sp0[chan2]; p14 = sp1[chan2];
1128       p05 = sp0[chan3]; p15 = sp1[chan3];
1129
1130       sp0 += chan4;
1131       sp1 += chan4;
1132
1133 #ifdef __SUNPRO_C
1134 #pragma pipeloop(0)
1135 #endif /* __SUNPRO_C */
1136       for (i = 0; i <= (wid - 2); i += 2) {
1137         p00 = p02; p10 = p12;
1138         p01 = p03; p11 = p13;
1139         p02 = p04; p12 = p14;
1140         p03 = p05; p13 = p15;
1141
1142         p04 = sp0[0];     p14 = sp1[0];
1143         p05 = sp0[chan1]; p15 = sp1[chan1];
1144
1145         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1146                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1147         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1148                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1149
1150         sp0 += chan2;
1151         sp1 += chan2;
1152         dp += chan2;
1153       }
1154
1155       if (wid & 1) {
1156         p00 = p02; p10 = p12;
1157         p01 = p03; p11 = p13;
1158         p02 = p04; p12 = p14;
1159         p03 = p05; p13 = p15;
1160
1161         p04 = sp0[0];     p14 = sp1[0];
1162
1163         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1164                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1165       }
1166
1167       /*
1168        *  3 loop
1169        */
      /* Kernel row 4 applied to source row j + 4; the completed sums are
         shifted right by shift2 and written through CLAMP_STORE. */
1170       dp = dl;
1171       sp0 = sl + 4*sll;
1172
1173       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1174
1175       p02 = sp0[0];
1176       p03 = sp0[chan1];
1177       p04 = sp0[chan2];
1178       p05 = sp0[chan3];
1179
1180       sp0 += chan2 + chan2;
1181
1182 #ifdef __SUNPRO_C
1183 #pragma pipeloop(0)
1184 #endif /* __SUNPRO_C */
1185       for (i = 0; i <= (wid - 2); i += 2) {
1186         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1187
1188         p04 = sp0[0]; p05 = sp0[chan1];
1189
1190         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1191                 p03 * k3 + p04 * k4) >> shift2;
1192         pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1193                 p04 * k3 + p05 * k4) >> shift2;
1194
1195         CLAMP_STORE(dp[0],     pix0)
1196         CLAMP_STORE(dp[chan1], pix1)
1197
1198         dp  += chan2;
1199         sp0 += chan2;
1200       }
1201
1202       if (wid & 1) {
1203         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1204
1205         p04 = sp0[0];
1206
1207         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1208                 p03 * k3 + p04 * k4) >> shift2;
1209         CLAMP_STORE(dp[0],     pix0)
1210       }
1211
1212       /* next line */
1213       sl += sll;
1214       dl += dll;
1215     }
1216   }
1217
  /* Release the accumulator only if it was taken from the heap. */
1218   if (pbuff != buff) mlib_free(pbuff);
1219
1220   return MLIB_SUCCESS;
1221 }
1222 
1223 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1224 
1225 /***************************************************************/
1226 #if IMG_TYPE == 1
1227 
/* Kernel dimension used by the 7x7 routine that follows. */
1228 #undef  KSIZE
1229 #define KSIZE 7
1230 
/*
 * 7x7 convolution in FTYPE arithmetic for DTYPE images.  Destination edge
 * pixels are not written: the processed area is shrunk by KSIZE - 1 in each
 * dimension and the destination start is moved by (KSIZE - 1)/2 rows and
 * pixels.
 *
 *   dst, src     - destination / source images
 *   kern,
 *   scalef_expon - not referenced directly in this body; presumably consumed
 *                  by LOAD_KERNEL (defined elsewhere) to fill k[] with the
 *                  KSIZE*KSIZE scaled coefficients - confirm against the macro
 *   cmask        - channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the row buffers cannot be
 * allocated.
 *
 * Source rows are kept, converted to FTYPE, in a ring of KSIZE + 1 row
 * buffers: KSIZE of them feed the current output row while the remaining one
 * is filled with the next source row.
 */
1231 mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
1232                            const mlib_image *src,
1233                            const mlib_s32   *kern,
1234                            mlib_s32         scalef_expon,
1235                            mlib_s32         cmask)
1236 {
1237   FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1238   FTYPE    k[KSIZE*KSIZE];
1239   mlib_s32 l, m, buff_ind;
1240   mlib_s32 d0, d1;
1241   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1242   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1243   DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1244   DEF_VARS(DTYPE);
1245   DTYPE *sl1;
1246   mlib_s32 chan2;
1247   mlib_s32 *buffo, *buffi;
1248   LOAD_KERNEL(KSIZE*KSIZE);
1249   GET_SRC_DST_PARAMETERS(DTYPE);
1250
  /* KSIZE + 3 rows of wid elements are needed: KSIZE + 1 ring rows, the
     accumulator row buffd, and one row shared by buffo and buffi. */
1251   if (wid > BUFF_LINE) {
1252     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
1253
1254     if (pbuff == NULL) return MLIB_FAILURE;
1255   }
1256
  /* The second half of buffs[] repeats the first half, so buffs + buff_ind
     can be indexed 0..KSIZE without wrapping. */
1257   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
1258   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1259   buffd = buffs[KSIZE] + wid;
1260   buffo = (mlib_s32*)(buffd + wid);
1261   buffi = buffo + (wid &~ 1);
1262
1263   chan1 = nchannel;
1264   chan2 = chan1 + chan1;
1265
  /* Only pixels with a complete 7x7 neighbourhood are produced. */
1266   wid -= (KSIZE - 1);
1267   hgt -= (KSIZE - 1);
1268
1269   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1270
1271   for (c = 0; c < nchannel; c++) {
    /* Skip channels that are not selected in cmask. */
1272     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1273
1274     sl = adr_src + c;
1275     dl = adr_dst + c;
1276
1277     sl1 = sl  + sll;
1278     sl2 = sl1 + sll;
1279     sl3 = sl2 + sll;
1280     sl4 = sl3 + sll;
1281     sl5 = sl4 + sll;
1282     sl6 = sl5 + sll;
1283 #ifdef __SUNPRO_C
1284 #pragma pipeloop(0)
1285 #endif /* __SUNPRO_C */
    /* Convert the first KSIZE source rows of this channel to FTYPE. */
1286     for (i = 0; i < wid + (KSIZE - 1); i++) {
1287       buffs[0][i] = (FTYPE)sl[i*chan1];
1288       buffs[1][i] = (FTYPE)sl1[i*chan1];
1289       buffs[2][i] = (FTYPE)sl2[i*chan1];
1290       buffs[3][i] = (FTYPE)sl3[i*chan1];
1291       buffs[4][i] = (FTYPE)sl4[i*chan1];
1292       buffs[5][i] = (FTYPE)sl5[i*chan1];
1293       buffs[6][i] = (FTYPE)sl6[i*chan1];
1294     }
1295
1296     buff_ind = 0;
1297
1298 #ifdef __SUNPRO_C
1299 #pragma pipeloop(0)
1300 #endif /* __SUNPRO_C */
1301     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1302
    /* sl now addresses the first source row that is not yet buffered. */
1303     sl += KSIZE*sll;
1304
1305     for (j = 0; j < hgt; j++) {
      /* buffc[0..KSIZE-1] are the rows under the kernel; buffn receives the
         next source row.  pk walks the coefficients in row-major order. */
1306       FTYPE    **buffc = buffs + buff_ind;
1307       FTYPE    *buffn = buffc[KSIZE];
1308       FTYPE    *pk = k;
1309
1310       for (l = 0; l < KSIZE; l++) {
        /* This local buff shadows the function-level buff array. */
1311         FTYPE    *buff = buffc[l];
1312         d64_2x32 dd;
1313
1314         sp = sl;
1315         dp = dl;
1316
        /* Prime the sliding window with the first six values of the row. */
1317         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1318         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1319
1320         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1321         k4 = *pk++; k5 = *pk++; k6 = *pk++;
1322
1323         if (l < (KSIZE - 1)) {
          /* Kernel rows 0..KSIZE-2: accumulate into buffd only. */
1324 #ifdef __SUNPRO_C
1325 #pragma pipeloop(0)
1326 #endif /* __SUNPRO_C */
1327           for (i = 0; i <= (wid - 2); i += 2) {
1328             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1329
1330             p6 = buff[i + 6]; p7 = buff[i + 7];
1331
1332             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1333             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1334           }
1335
1336         } else {
          /* Last kernel row: finish the sums, store two destination pixels
             per iteration, reset the accumulator and fill buffn with the next
             source row.  LOAD_BUFF is defined elsewhere; presumably it places
             the next two source pixels in buffi[i], buffi[i + 1], which are
             then read back through dd - confirm against the macro. */
1337 #ifdef __SUNPRO_C
1338 #pragma pipeloop(0)
1339 #endif /* __SUNPRO_C */
1340           for (i = 0; i <= (wid - 2); i += 2) {
1341             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1342
1343             p6 = buff[i + 6]; p7 = buff[i + 7];
1344
1345             LOAD_BUFF(buffi);
1346
1347             dd.d64 = *(FTYPE   *)(buffi + i);
1348             buffn[i    ] = (FTYPE)dd.i32s.i0;
1349             buffn[i + 1] = (FTYPE)dd.i32s.i1;
1350
1351             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1352             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1353
1354             dp[0    ] = FROM_S32(d0);
1355             dp[chan1] = FROM_S32(d1);
1356
1357             buffd[i    ] = 0.0;
1358             buffd[i + 1] = 0.0;
1359
1360             sp += chan2;
1361             dp += chan2;
1362           }
1363         }
1364       }
1365
1366       /* last pixels */
      /* i, sp and dp carry over from the final unrolled loop above; a
         leftover column (odd wid) gets the full 49-tap sum computed directly. */
1367       for (; i < wid; i++) {
1368         FTYPE    *pk = k, s = 0;
1369         mlib_s32 d0;
1370
1371         for (l = 0; l < KSIZE; l++) {
1372           FTYPE    *buff = buffc[l] + i;
1373
1374           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1375         }
1376
1377         d0 = D2I(s);
1378         dp[0] = FROM_S32(d0);
1379
1380         buffn[i] = (FTYPE)sp[0];
1381
1382         sp += chan1;
1383         dp += chan1;
1384       }
1385
      /* Copy the remaining KSIZE - 1 pixels of the next source row. */
1386       for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
1387
1388       /* next line */
1389       sl += sll;
1390       dl += dll;
1391
      /* Rotate the ring of row buffers by one. */
1392       buff_ind++;
1393
1394       if (buff_ind >= KSIZE + 1) buff_ind = 0;
1395     }
1396   }
1397
  /* Release the row buffers only if they were taken from the heap. */
1398   if (pbuff != buff) mlib_free(pbuff);
1399
1400   return MLIB_SUCCESS;
1401 }
1402 
1403 #endif /* IMG_TYPE == 1 */
1404 
1405 /***************************************************************/
/*
 * MAX_KER: widest kernel-row segment that CONV_FUNC(MxN) processes in a
 *          single pass; wider kernel rows are split into several segments.
 * MAX_N:   largest kernel height for which CONV_FUNC(MxN) can use its
 *          on-stack table of row-buffer pointers.
 */
1406 #define MAX_KER   7
1407 #define MAX_N    15
1408 
mlib_ImageConv1xN(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dn,mlib_s32 cmask)1409 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
1410                                      const mlib_image *src,
1411                                      const mlib_d64   *k,
1412                                      mlib_s32         n,
1413                                      mlib_s32         dn,
1414                                      mlib_s32         cmask)
1415 {
1416   FTYPE    buff[BUFF_SIZE];
1417   mlib_s32 off, kh;
1418   mlib_s32 d0, d1;
1419   const FTYPE    *pk;
1420   FTYPE    k0, k1, k2, k3;
1421   FTYPE    p0, p1, p2, p3, p4;
1422   DEF_VARS(DTYPE);
1423   DTYPE    *sl_c, *dl_c, *sl0;
1424   mlib_s32 l, hsize, max_hsize;
1425   GET_SRC_DST_PARAMETERS(DTYPE);
1426 
1427   hgt -= (n - 1);
1428   adr_dst += dn*dll;
1429 
1430   max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1431 
1432   if (!max_hsize) max_hsize = 1;
1433 
1434   if (max_hsize > BUFF_SIZE) {
1435     pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1436   }
1437 
1438   chan1 = nchannel;
1439 
1440   sl_c = adr_src;
1441   dl_c = adr_dst;
1442 
1443   for (l = 0; l < hgt; l += hsize) {
1444     hsize = hgt - l;
1445 
1446     if (hsize > max_hsize) hsize = max_hsize;
1447 
1448     for (c = 0; c < nchannel; c++) {
1449       if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1450 
1451       sl = sl_c + c;
1452       dl = dl_c + c;
1453 
1454 #ifdef __SUNPRO_C
1455 #pragma pipeloop(0)
1456 #endif /* __SUNPRO_C */
1457       for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1458 
1459       for (i = 0; i < wid; i++) {
1460         sl0 = sl;
1461 
1462         for (off = 0; off < (n - 4); off += 4) {
1463           pk = k + off;
1464           sp = sl0;
1465 
1466           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1467           p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1468           sp += 3*sll;
1469 
1470 #ifdef __SUNPRO_C
1471 #pragma pipeloop(0)
1472 #endif /* __SUNPRO_C */
1473           for (j = 0; j < hsize; j += 2) {
1474             p0 = p2; p1 = p3; p2 = p4;
1475             p3 = sp[0];
1476             p4 = sp[sll];
1477 
1478             pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1479             pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1480 
1481             sp += 2*sll;
1482           }
1483 
1484           sl0 += 4*sll;
1485         }
1486 
1487         pk = k + off;
1488         sp = sl0;
1489 
1490         k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1491         p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1492 
1493         dp = dl;
1494         kh = n - off;
1495 
1496         if (kh == 4) {
1497           sp += 3*sll;
1498 
1499 #ifdef __SUNPRO_C
1500 #pragma pipeloop(0)
1501 #endif /* __SUNPRO_C */
1502           for (j = 0; j <= (hsize - 2); j += 2) {
1503             p0 = p2; p1 = p3; p2 = p4;
1504             p3 = sp[0];
1505             p4 = sp[sll];
1506 
1507             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1508             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1509 
1510             dp[0  ] = FROM_S32(d0);
1511             dp[dll] = FROM_S32(d1);
1512 
1513             pbuff[j] = 0;
1514             pbuff[j + 1] = 0;
1515 
1516             sp += 2*sll;
1517             dp += 2*dll;
1518           }
1519 
1520           if (j < hsize) {
1521             p0 = p2; p1 = p3; p2 = p4;
1522             p3 = sp[0];
1523 
1524             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1525 
1526             pbuff[j] = 0;
1527 
1528             dp[0] = FROM_S32(d0);
1529           }
1530 
1531         } else if (kh == 3) {
1532           sp += 2*sll;
1533 
1534 #ifdef __SUNPRO_C
1535 #pragma pipeloop(0)
1536 #endif /* __SUNPRO_C */
1537           for (j = 0; j <= (hsize - 2); j += 2) {
1538             p0 = p2; p1 = p3;
1539             p2 = sp[0];
1540             p3 = sp[sll];
1541 
1542             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1543             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1544 
1545             dp[0  ] = FROM_S32(d0);
1546             dp[dll] = FROM_S32(d1);
1547 
1548             pbuff[j] = 0;
1549             pbuff[j + 1] = 0;
1550 
1551             sp += 2*sll;
1552             dp += 2*dll;
1553           }
1554 
1555           if (j < hsize) {
1556             p0 = p2; p1 = p3;
1557             p2 = sp[0];
1558 
1559             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1560 
1561             pbuff[j] = 0;
1562 
1563             dp[0] = FROM_S32(d0);
1564           }
1565 
1566         } else if (kh == 2) {
1567           sp += sll;
1568 
1569 #ifdef __SUNPRO_C
1570 #pragma pipeloop(0)
1571 #endif /* __SUNPRO_C */
1572           for (j = 0; j <= (hsize - 2); j += 2) {
1573             p0 = p2;
1574             p1 = sp[0];
1575             p2 = sp[sll];
1576 
1577             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1578             d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1579 
1580             dp[0  ] = FROM_S32(d0);
1581             dp[dll] = FROM_S32(d1);
1582 
1583             pbuff[j] = 0;
1584             pbuff[j + 1] = 0;
1585 
1586             sp += 2*sll;
1587             dp += 2*dll;
1588           }
1589 
1590           if (j < hsize) {
1591             p0 = p2;
1592             p1 = sp[0];
1593 
1594             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1595 
1596             pbuff[j] = 0;
1597 
1598             dp[0] = FROM_S32(d0);
1599           }
1600 
1601         } else /* if (kh == 1) */ {
1602 #ifdef __SUNPRO_C
1603 #pragma pipeloop(0)
1604 #endif /* __SUNPRO_C */
1605           for (j = 0; j < hsize; j++) {
1606             p0 = sp[0];
1607 
1608             d0 = D2I(p0*k0 + pbuff[j]);
1609 
1610             dp[0] = FROM_S32(d0);
1611 
1612             pbuff[j] = 0;
1613 
1614             sp += sll;
1615             dp += dll;
1616           }
1617         }
1618 
1619         sl += chan1;
1620         dl += chan1;
1621       }
1622     }
1623 
1624     sl_c += max_hsize*sll;
1625     dl_c += max_hsize*dll;
1626   }
1627 
1628   if (pbuff != buff) mlib_free(pbuff);
1629 
1630   return MLIB_SUCCESS;
1631 }
1632 
1633 /***************************************************************/
/*
 * General m x n convolution in FTYPE arithmetic for DTYPE images.
 * Destination edge pixels are not written: the processed area is shrunk by
 * m - 1 columns and n - 1 rows, and writing starts dn rows and dm pixels
 * into the destination.
 *
 *   dst, src - destination / source images
 *   kernel   - m*n integer coefficients in row-major order (m per row)
 *   m, n     - kernel width and height
 *   dm, dn   - column / row offset of the first written destination pixel
 *   scale    - scaling exponent: every coefficient is multiplied by
 *              DSCALE / 2^scale
 *   cmask    - channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_FAILURE directly when the coefficient array cannot be
 * allocated.  Every other exit goes through FREE_AND_RETURN_STATUS (defined
 * elsewhere; presumably it frees the heap buffers and returns status), with
 * status set to MLIB_FAILURE when the row buffers cannot be allocated.
 * A kernel of width 1 is delegated to mlib_ImageConv1xN().
 */
CONV_FUNC(MxN)1634 mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
1635                            const mlib_image *src,
1636                            const mlib_s32   *kernel,
1637                            mlib_s32         m,
1638                            mlib_s32         n,
1639                            mlib_s32         dm,
1640                            mlib_s32         dn,
1641                            mlib_s32         scale,
1642                            mlib_s32         cmask)
1643 {
1644   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1645   FTYPE    **buffs = buffs_arr, *buffd;
1646   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
1647   mlib_s32 mn, l, off, kw, bsize, buff_ind;
1648   mlib_s32 d0, d1;
1649   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1650   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1651   d64_2x32 dd;
1652   DEF_VARS(DTYPE);
1653   mlib_s32 chan2;
1654   mlib_s32 *buffo, *buffi;
1655   mlib_status status = MLIB_SUCCESS;
1656
1657   GET_SRC_DST_PARAMETERS(DTYPE);
1658
  /* The division by 2^scale is done in two steps so that the integer shift
     stays below 31 bits.
     NOTE(review): only one reduction by 30 is applied, so scale is
     presumably guaranteed by callers not to exceed 60 - confirm. */
1659   if (scale > 30) {
1660     fscale *= 1.0/(1 << 30);
1661     scale -= 30;
1662   }
1663
1664   fscale /= (1 << scale);
1665
1666   mn = m*n;
1667
  /* Kernels with more than 256 coefficients do not fit in akernel[]. */
1668   if (mn > 256) {
1669     k = mlib_malloc(mn*sizeof(mlib_d64));
1670
1671     if (k == NULL) return MLIB_FAILURE;
1672   }
1673
1674   for (i = 0; i < mn; i++) {
1675     k[i] = kernel[i]*fscale;
1676   }
1677
1678   if (m == 1) {
1679     status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1680     FREE_AND_RETURN_STATUS;
1681   }
1682
  /* n + 3 rows of wid elements are needed: n + 1 ring rows, the accumulator
     row buffd, and one row shared by buffo and buffi. */
1683   bsize = (n + 3)*wid;
1684
  /* When the heap is used, the row-pointer table is placed right after the
     row data in the same allocation. */
1685   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1686     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1687
1688     if (pbuff == NULL) {
1689       status = MLIB_FAILURE;
1690       FREE_AND_RETURN_STATUS;
1691     }
1692     buffs = (FTYPE   **)(pbuff + bsize);
1693   }
1694
  /* The second half of buffs[] repeats the first half, so buffs + buff_ind
     can be indexed 0..n without wrapping. */
1695   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1696   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1697   buffd = buffs[n] + wid;
1698   buffo = (mlib_s32*)(buffd + wid);
1699   buffi = buffo + (wid &~ 1);
1700
1701   chan1 = nchannel;
1702   chan2 = chan1 + chan1;
1703
  /* Only pixels with a complete m x n neighbourhood are produced. */
1704   wid -= (m - 1);
1705   hgt -= (n - 1);
1706   adr_dst += dn*dll + dm*nchannel;
1707
1708   for (c = 0; c < nchannel; c++) {
    /* Skip channels that are not selected in cmask. */
1709     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1710
1711     sl = adr_src + c;
1712     dl = adr_dst + c;
1713
    /* Convert the first n source rows of this channel to FTYPE; afterwards
       sl addresses the first row that is not yet buffered. */
1714     for (l = 0; l < n; l++) {
1715       FTYPE    *buff = buffs[l];
1716
1717 #ifdef __SUNPRO_C
1718 #pragma pipeloop(0)
1719 #endif /* __SUNPRO_C */
1720       for (i = 0; i < wid + (m - 1); i++) {
1721         buff[i] = (FTYPE)sl[i*chan1];
1722       }
1723
1724       sl += sll;
1725     }
1726
1727     buff_ind = 0;
1728
1729 #ifdef __SUNPRO_C
1730 #pragma pipeloop(0)
1731 #endif /* __SUNPRO_C */
1732     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1733
1734     for (j = 0; j < hgt; j++) {
      /* buffc[0..n-1] are the rows under the kernel; buffn receives the next
         source row.  pk walks the coefficients in row-major order. */
1735       FTYPE    **buffc = buffs + buff_ind;
1736       FTYPE    *buffn = buffc[n];
1737       FTYPE    *pk = k;
1738
1739       for (l = 0; l < n; l++) {
1740         FTYPE    *buff_l = buffc[l];
1741
        /* A kernel row is handled in segments of kw taps. */
1742         for (off = 0; off < m;) {
          /* This local buff shadows the function-level buff array. */
1743           FTYPE    *buff = buff_l + off;
1744
1745           kw = m - off;
1746
          /* More than 2*MAX_KER taps left: take MAX_KER.  Between
             MAX_KER + 1 and 2*MAX_KER left: take half, so the remainder is
             never a single tap.  Otherwise take everything that is left. */
1747           if (kw > 2*MAX_KER) kw = MAX_KER; else
1748             if (kw > MAX_KER) kw = kw/2;
1749           off += kw;
1750
1751           sp = sl;
1752           dp = dl;
1753
          /* Six window values and seven coefficients are preloaded whatever
             kw is; the branches below use only what they need.
             NOTE(review): for the last segment of the last kernel row with
             kw < 7 the pk[] loads appear to reach past k[mn - 1] - confirm
             whether this matters for heap-allocated kernels. */
1754           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1755           p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1756
1757           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1758           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1759           pk += kw;
1760
          /* Every kw branch has the same two variants.  While kernel rows or
             segments remain (l < n - 1 || off < m) the segment is only added
             to buffd.  The final segment of the final row additionally
             fetches the next source row into buffn, converts and stores two
             destination pixels per iteration, and resets buffd. */
1761           if (kw == 7) {
1762
1763             if (l < (n - 1) || off < m) {
1764 #ifdef __SUNPRO_C
1765 #pragma pipeloop(0)
1766 #endif /* __SUNPRO_C */
1767               for (i = 0; i <= (wid - 2); i += 2) {
1768                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1769
1770                 p6 = buff[i + 6]; p7 = buff[i + 7];
1771
1772                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1773                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1774               }
1775
1776             } else {
1777 #ifdef __SUNPRO_C
1778 #pragma pipeloop(0)
1779 #endif /* __SUNPRO_C */
              /* Only this branch fetches the next row through LOAD_BUFF and
                 buffi (macro defined elsewhere; presumably it places the next
                 two source pixels in buffi[i], buffi[i + 1] - confirm).  The
                 narrower branches read sp directly. */
1780               for (i = 0; i <= (wid - 2); i += 2) {
1781                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1782
1783                 p6 = buff[i + 6]; p7 = buff[i + 7];
1784
1785                 LOAD_BUFF(buffi);
1786
1787                 dd.d64 = *(FTYPE   *)(buffi + i);
1788                 buffn[i    ] = (FTYPE)dd.i32s.i0;
1789                 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1790
1791                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1792                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1793
1794                 dp[0    ] = FROM_S32(d0);
1795                 dp[chan1] = FROM_S32(d1);
1796
1797                 buffd[i    ] = 0.0;
1798                 buffd[i + 1] = 0.0;
1799
1800                 sp += chan2;
1801                 dp += chan2;
1802               }
1803             }
1804
1805           } else if (kw == 6) {
1806
1807             if (l < (n - 1) || off < m) {
1808 #ifdef __SUNPRO_C
1809 #pragma pipeloop(0)
1810 #endif /* __SUNPRO_C */
1811               for (i = 0; i <= (wid - 2); i += 2) {
1812                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1813
1814                 p5 = buff[i + 5]; p6 = buff[i + 6];
1815
1816                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1817                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1818               }
1819
1820             } else {
1821 #ifdef __SUNPRO_C
1822 #pragma pipeloop(0)
1823 #endif /* __SUNPRO_C */
1824               for (i = 0; i <= (wid - 2); i += 2) {
1825                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1826
1827                 p5 = buff[i + 5]; p6 = buff[i + 6];
1828
1829                 buffn[i    ] = (FTYPE)sp[0];
1830                 buffn[i + 1] = (FTYPE)sp[chan1];
1831
1832                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1833                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1834
1835                 dp[0    ] = FROM_S32(d0);
1836                 dp[chan1] = FROM_S32(d1);
1837
1838                 buffd[i    ] = 0.0;
1839                 buffd[i + 1] = 0.0;
1840
1841                 sp += chan2;
1842                 dp += chan2;
1843               }
1844             }
1845
1846           } else if (kw == 5) {
1847
1848             if (l < (n - 1) || off < m) {
1849 #ifdef __SUNPRO_C
1850 #pragma pipeloop(0)
1851 #endif /* __SUNPRO_C */
1852               for (i = 0; i <= (wid - 2); i += 2) {
1853                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1854
1855                 p4 = buff[i + 4]; p5 = buff[i + 5];
1856
1857                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1858                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1859               }
1860
1861             } else {
1862 #ifdef __SUNPRO_C
1863 #pragma pipeloop(0)
1864 #endif /* __SUNPRO_C */
1865               for (i = 0; i <= (wid - 2); i += 2) {
1866                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1867
1868                 p4 = buff[i + 4]; p5 = buff[i + 5];
1869
1870                 buffn[i    ] = (FTYPE)sp[0];
1871                 buffn[i + 1] = (FTYPE)sp[chan1];
1872
1873                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1874                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1875
1876                 dp[0    ] = FROM_S32(d0);
1877                 dp[chan1] = FROM_S32(d1);
1878
1879                 buffd[i    ] = 0.0;
1880                 buffd[i + 1] = 0.0;
1881
1882                 sp += chan2;
1883                 dp += chan2;
1884               }
1885             }
1886
1887           } else if (kw == 4) {
1888
1889             if (l < (n - 1) || off < m) {
1890 #ifdef __SUNPRO_C
1891 #pragma pipeloop(0)
1892 #endif /* __SUNPRO_C */
1893               for (i = 0; i <= (wid - 2); i += 2) {
1894                 p0 = p2; p1 = p3; p2 = p4;
1895
1896                 p3 = buff[i + 3]; p4 = buff[i + 4];
1897
1898                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1899                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1900               }
1901
1902             } else {
1903 #ifdef __SUNPRO_C
1904 #pragma pipeloop(0)
1905 #endif /* __SUNPRO_C */
1906               for (i = 0; i <= (wid - 2); i += 2) {
1907                 p0 = p2; p1 = p3; p2 = p4;
1908
1909                 p3 = buff[i + 3]; p4 = buff[i + 4];
1910
1911                 buffn[i    ] = (FTYPE)sp[0];
1912                 buffn[i + 1] = (FTYPE)sp[chan1];
1913
1914                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1915                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1916
1917                 dp[0    ] = FROM_S32(d0);
1918                 dp[chan1] = FROM_S32(d1);
1919
1920                 buffd[i    ] = 0.0;
1921                 buffd[i + 1] = 0.0;
1922
1923                 sp += chan2;
1924                 dp += chan2;
1925               }
1926             }
1927
1928           } else if (kw == 3) {
1929
1930             if (l < (n - 1) || off < m) {
1931 #ifdef __SUNPRO_C
1932 #pragma pipeloop(0)
1933 #endif /* __SUNPRO_C */
1934               for (i = 0; i <= (wid - 2); i += 2) {
1935                 p0 = p2; p1 = p3;
1936
1937                 p2 = buff[i + 2]; p3 = buff[i + 3];
1938
1939                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1940                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1941               }
1942
1943             } else {
1944 #ifdef __SUNPRO_C
1945 #pragma pipeloop(0)
1946 #endif /* __SUNPRO_C */
1947               for (i = 0; i <= (wid - 2); i += 2) {
1948                 p0 = p2; p1 = p3;
1949
1950                 p2 = buff[i + 2]; p3 = buff[i + 3];
1951
1952                 buffn[i    ] = (FTYPE)sp[0];
1953                 buffn[i + 1] = (FTYPE)sp[chan1];
1954
1955                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1956                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1957
1958                 dp[0    ] = FROM_S32(d0);
1959                 dp[chan1] = FROM_S32(d1);
1960
1961                 buffd[i    ] = 0.0;
1962                 buffd[i + 1] = 0.0;
1963
1964                 sp += chan2;
1965                 dp += chan2;
1966               }
1967             }
1968
1969           } else /*if (kw == 2)*/ {
1970
1971             if (l < (n - 1) || off < m) {
1972 #ifdef __SUNPRO_C
1973 #pragma pipeloop(0)
1974 #endif /* __SUNPRO_C */
1975               for (i = 0; i <= (wid - 2); i += 2) {
1976                 p0 = p2;
1977
1978                 p1 = buff[i + 1]; p2 = buff[i + 2];
1979
1980                 buffd[i    ] += p0*k0 + p1*k1;
1981                 buffd[i + 1] += p1*k0 + p2*k1;
1982               }
1983
1984             } else {
1985 #ifdef __SUNPRO_C
1986 #pragma pipeloop(0)
1987 #endif /* __SUNPRO_C */
1988               for (i = 0; i <= (wid - 2); i += 2) {
1989                 p0 = p2;
1990
1991                 p1 = buff[i + 1]; p2 = buff[i + 2];
1992
1993                 buffn[i    ] = (FTYPE)sp[0];
1994                 buffn[i + 1] = (FTYPE)sp[chan1];
1995
1996                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
1997                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1998
1999                 dp[0    ] = FROM_S32(d0);
2000                 dp[chan1] = FROM_S32(d1);
2001
2002                 buffd[i    ] = 0.0;
2003                 buffd[i + 1] = 0.0;
2004
2005                 sp += chan2;
2006                 dp += chan2;
2007               }
2008             }
2009           }
2010         }
2011       }
2012
2013       /* last pixels */
      /* i, sp and dp carry over from the final unrolled loop above; a
         leftover column (odd wid) gets the full m x n sum computed directly. */
2014       for (; i < wid; i++) {
2015         FTYPE    *pk = k, s = 0;
2016         mlib_s32 x, d0;
2017
2018         for (l = 0; l < n; l++) {
2019           FTYPE    *buff = buffc[l] + i;
2020
2021           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2022         }
2023
2024         d0 = D2I(s);
2025         dp[0] = FROM_S32(d0);
2026
2027         buffn[i] = (FTYPE)sp[0];
2028
2029         sp += chan1;
2030         dp += chan1;
2031       }
2032
      /* Copy the remaining m - 1 pixels of the next source row. */
2033       for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2034
2035       /* next line */
2036       sl += sll;
2037       dl += dll;
2038
      /* Rotate the ring of row buffers by one. */
2039       buff_ind++;
2040
2041       if (buff_ind >= n + 1) buff_ind = 0;
2042     }
2043   }
2044
2045   FREE_AND_RETURN_STATUS;
2046 }
2047 
2048 /***************************************************************/
2049 #ifndef __sparc /* for x86, using integer multiplies is faster */
2050 
/*
 * STORE_RES(res, x): finish one output sample of the integer convolution.
 *
 * Shifts the accumulator `x` right by `shift2` bits IN PLACE (so `x` is
 * modified and must be an lvalue) and then hands it to CLAMP_STORE to be
 * written to `res`.  `shift2` is not a macro argument: a variable of that
 * name must be in scope wherever the macro is expanded.
 *
 * NOTE(review): CLAMP_STORE is defined elsewhere in this file; presumably it
 * saturates the value to the DTYPE range before storing - confirm there.
 *
 * The expansion is more than one statement and is not wrapped in
 * do { } while (0), so never use it as the unbraced body of an if/else/loop.
 */
#define STORE_RES(res, x)                                       \
  x >>= shift2;                                                 \
  CLAMP_STORE(res, x)
2054 
CONV_FUNC_I(MxN)2055 mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
2056                              const mlib_image *src,
2057                              const mlib_s32   *kernel,
2058                              mlib_s32         m,
2059                              mlib_s32         n,
2060                              mlib_s32         dm,
2061                              mlib_s32         dn,
2062                              mlib_s32         scale,
2063                              mlib_s32         cmask)
2064 {
2065   mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2066   mlib_s32 l, off, kw;
2067   mlib_s32 d0, d1, shift1, shift2;
2068   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2069   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2070   DTYPE    *adr_src, *sl, *sp = NULL;
2071   DTYPE    *adr_dst, *dl, *dp = NULL;
2072   mlib_s32 wid, hgt, sll, dll;
2073   mlib_s32 nchannel, chan1;
2074   mlib_s32 i, j, c;
2075   mlib_s32 chan2;
2076   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2077   GET_SRC_DST_PARAMETERS(DTYPE);
2078 
2079 #if IMG_TYPE != 1
2080   shift1 = 16;
2081 #else
2082   shift1 = 8;
2083 #endif /* IMG_TYPE != 1 */
2084   shift2 = scale - shift1;
2085 
2086   chan1 = nchannel;
2087   chan2 = chan1 + chan1;
2088 
2089   wid -= (m - 1);
2090   hgt -= (n - 1);
2091   adr_dst += dn*dll + dm*nchannel;
2092 
2093   if (wid > BUFF_SIZE) {
2094     buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2095 
2096     if (buffd == NULL) return MLIB_FAILURE;
2097   }
2098 
2099   if (m*n > MAX_N*MAX_N) {
2100     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2101 
2102     if (k == NULL) {
2103       if (buffd != buff) mlib_free(buffd);
2104       return MLIB_FAILURE;
2105     }
2106   }
2107 
2108   for (i = 0; i < m*n; i++) {
2109     k[i] = kernel[i] >> shift1;
2110   }
2111 
2112   for (c = 0; c < nchannel; c++) {
2113     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2114 
2115     sl = adr_src + c;
2116     dl = adr_dst + c;
2117 
2118 #ifdef __SUNPRO_C
2119 #pragma pipeloop(0)
2120 #endif /* __SUNPRO_C */
2121     for (i = 0; i < wid; i++) buffd[i] = 0;
2122 
2123     for (j = 0; j < hgt; j++) {
2124       mlib_s32 *pk = k;
2125 
2126       for (l = 0; l < n; l++) {
2127         DTYPE *sp0 = sl + l*sll;
2128 
2129         for (off = 0; off < m;) {
2130           sp = sp0 + off*chan1;
2131           dp = dl;
2132 
2133           kw = m - off;
2134 
2135           if (kw > 2*MAX_KER) kw = MAX_KER; else
2136             if (kw > MAX_KER) kw = kw/2;
2137           off += kw;
2138 
2139           p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2140           p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2141 
2142           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2143           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2144           pk += kw;
2145 
2146           sp += (kw - 1)*chan1;
2147 
2148           if (kw == 7) {
2149 
2150             if (l < (n - 1) || off < m) {
2151 #ifdef __SUNPRO_C
2152 #pragma pipeloop(0)
2153 #endif /* __SUNPRO_C */
2154               for (i = 0; i <= (wid - 2); i += 2) {
2155                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2156                 p6 = sp[0];
2157                 p7 = sp[chan1];
2158 
2159                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2160                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2161 
2162                 sp += chan2;
2163               }
2164 
2165             } else {
2166 #ifdef __SUNPRO_C
2167 #pragma pipeloop(0)
2168 #endif /* __SUNPRO_C */
2169               for (i = 0; i <= (wid - 2); i += 2) {
2170                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2171                 p6 = sp[0];
2172                 p7 = sp[chan1];
2173 
2174                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
2175                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2176 
2177                 STORE_RES(dp[0    ], d0);
2178                 STORE_RES(dp[chan1], d1);
2179 
2180                 buffd[i    ] = 0;
2181                 buffd[i + 1] = 0;
2182 
2183                 sp += chan2;
2184                 dp += chan2;
2185               }
2186             }
2187 
2188           } else if (kw == 6) {
2189 
2190             if (l < (n - 1) || off < m) {
2191 #ifdef __SUNPRO_C
2192 #pragma pipeloop(0)
2193 #endif /* __SUNPRO_C */
2194               for (i = 0; i <= (wid - 2); i += 2) {
2195                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2196                 p5 = sp[0];
2197                 p6 = sp[chan1];
2198 
2199                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2200                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2201 
2202                 sp += chan2;
2203               }
2204 
2205             } else {
2206 #ifdef __SUNPRO_C
2207 #pragma pipeloop(0)
2208 #endif /* __SUNPRO_C */
2209               for (i = 0; i <= (wid - 2); i += 2) {
2210                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2211                 p5 = sp[0];
2212                 p6 = sp[chan1];
2213 
2214                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
2215                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2216 
2217                 STORE_RES(dp[0    ], d0);
2218                 STORE_RES(dp[chan1], d1);
2219 
2220                 buffd[i    ] = 0;
2221                 buffd[i + 1] = 0;
2222 
2223                 sp += chan2;
2224                 dp += chan2;
2225               }
2226             }
2227 
2228           } else if (kw == 5) {
2229 
2230             if (l < (n - 1) || off < m) {
2231 #ifdef __SUNPRO_C
2232 #pragma pipeloop(0)
2233 #endif /* __SUNPRO_C */
2234               for (i = 0; i <= (wid - 2); i += 2) {
2235                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2236                 p4 = sp[0];
2237                 p5 = sp[chan1];
2238 
2239                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2240                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2241 
2242                 sp += chan2;
2243               }
2244 
2245             } else {
2246 #ifdef __SUNPRO_C
2247 #pragma pipeloop(0)
2248 #endif /* __SUNPRO_C */
2249               for (i = 0; i <= (wid - 2); i += 2) {
2250                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2251                 p4 = sp[0];
2252                 p5 = sp[chan1];
2253 
2254                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
2255                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2256 
2257                 STORE_RES(dp[0    ], d0);
2258                 STORE_RES(dp[chan1], d1);
2259 
2260                 buffd[i    ] = 0;
2261                 buffd[i + 1] = 0;
2262 
2263                 sp += chan2;
2264                 dp += chan2;
2265               }
2266             }
2267 
2268           } else if (kw == 4) {
2269 
2270             if (l < (n - 1) || off < m) {
2271 #ifdef __SUNPRO_C
2272 #pragma pipeloop(0)
2273 #endif /* __SUNPRO_C */
2274               for (i = 0; i <= (wid - 2); i += 2) {
2275                 p0 = p2; p1 = p3; p2 = p4;
2276                 p3 = sp[0];
2277                 p4 = sp[chan1];
2278 
2279                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2280                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2281 
2282                 sp += chan2;
2283               }
2284 
2285             } else {
2286 #ifdef __SUNPRO_C
2287 #pragma pipeloop(0)
2288 #endif /* __SUNPRO_C */
2289               for (i = 0; i <= (wid - 2); i += 2) {
2290                 p0 = p2; p1 = p3; p2 = p4;
2291                 p3 = sp[0];
2292                 p4 = sp[chan1];
2293 
2294                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
2295                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2296 
2297                 STORE_RES(dp[0    ], d0);
2298                 STORE_RES(dp[chan1], d1);
2299 
2300                 buffd[i    ] = 0;
2301                 buffd[i + 1] = 0;
2302 
2303                 sp += chan2;
2304                 dp += chan2;
2305               }
2306             }
2307 
2308           } else if (kw == 3) {
2309 
2310             if (l < (n - 1) || off < m) {
2311 #ifdef __SUNPRO_C
2312 #pragma pipeloop(0)
2313 #endif /* __SUNPRO_C */
2314               for (i = 0; i <= (wid - 2); i += 2) {
2315                 p0 = p2; p1 = p3;
2316                 p2 = sp[0];
2317                 p3 = sp[chan1];
2318 
2319                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
2320                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2321 
2322                 sp += chan2;
2323               }
2324 
2325             } else {
2326 #ifdef __SUNPRO_C
2327 #pragma pipeloop(0)
2328 #endif /* __SUNPRO_C */
2329               for (i = 0; i <= (wid - 2); i += 2) {
2330                 p0 = p2; p1 = p3;
2331                 p2 = sp[0];
2332                 p3 = sp[chan1];
2333 
2334                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
2335                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2336 
2337                 STORE_RES(dp[0    ], d0);
2338                 STORE_RES(dp[chan1], d1);
2339 
2340                 buffd[i    ] = 0;
2341                 buffd[i + 1] = 0;
2342 
2343                 sp += chan2;
2344                 dp += chan2;
2345               }
2346             }
2347 
2348           } else if (kw == 2) {
2349 
2350             if (l < (n - 1) || off < m) {
2351 #ifdef __SUNPRO_C
2352 #pragma pipeloop(0)
2353 #endif /* __SUNPRO_C */
2354               for (i = 0; i <= (wid - 2); i += 2) {
2355                 p0 = p2;
2356                 p1 = sp[0];
2357                 p2 = sp[chan1];
2358 
2359                 buffd[i    ] += p0*k0 + p1*k1;
2360                 buffd[i + 1] += p1*k0 + p2*k1;
2361 
2362                 sp += chan2;
2363               }
2364 
2365             } else {
2366 #ifdef __SUNPRO_C
2367 #pragma pipeloop(0)
2368 #endif /* __SUNPRO_C */
2369               for (i = 0; i <= (wid - 2); i += 2) {
2370                 p0 = p2;
2371                 p1 = sp[0];
2372                 p2 = sp[chan1];
2373 
2374                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
2375                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2376 
2377                 STORE_RES(dp[0    ], d0);
2378                 STORE_RES(dp[chan1], d1);
2379 
2380                 buffd[i    ] = 0;
2381                 buffd[i + 1] = 0;
2382 
2383                 sp += chan2;
2384                 dp += chan2;
2385               }
2386             }
2387 
2388           } else /*if (kw == 1)*/ {
2389 
2390             if (l < (n - 1) || off < m) {
2391 #ifdef __SUNPRO_C
2392 #pragma pipeloop(0)
2393 #endif /* __SUNPRO_C */
2394               for (i = 0; i <= (wid - 2); i += 2) {
2395                 p0 = sp[0];
2396                 p1 = sp[chan1];
2397 
2398                 buffd[i    ] += p0*k0;
2399                 buffd[i + 1] += p1*k0;
2400 
2401                 sp += chan2;
2402               }
2403 
2404             } else {
2405 #ifdef __SUNPRO_C
2406 #pragma pipeloop(0)
2407 #endif /* __SUNPRO_C */
2408               for (i = 0; i <= (wid - 2); i += 2) {
2409                 p0 = sp[0];
2410                 p1 = sp[chan1];
2411 
2412                 d0 = (p0*k0 + buffd[i    ]);
2413                 d1 = (p1*k0 + buffd[i + 1]);
2414 
2415                 STORE_RES(dp[0    ], d0);
2416                 STORE_RES(dp[chan1], d1);
2417 
2418                 buffd[i    ] = 0;
2419                 buffd[i + 1] = 0;
2420 
2421                 sp += chan2;
2422                 dp += chan2;
2423               }
2424             }
2425           }
2426         }
2427       }
2428 
2429       /* last pixels */
2430       for (; i < wid; i++) {
2431         mlib_s32 *pk = k, s = 0;
2432         mlib_s32 x;
2433 
2434         for (l = 0; l < n; l++) {
2435           sp = sl + l*sll + i*chan1;
2436 
2437           for (x = 0; x < m; x++) {
2438             s += sp[0] * pk[0];
2439             sp += chan1;
2440             pk ++;
2441           }
2442         }
2443 
2444         STORE_RES(dp[0], s);
2445 
2446         sp += chan1;
2447         dp += chan1;
2448       }
2449 
2450       sl += sll;
2451       dl += dll;
2452     }
2453   }
2454 
2455   if (buffd != buff) mlib_free(buffd);
2456   if (k != k_locl) mlib_free(k);
2457 
2458   return MLIB_SUCCESS;
2459 }
2460 
2461 /***************************************************************/
2462 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
2463 
2464 /***************************************************************/
2465