1 /*
2  * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 /*
28  * FUNCTION
29  *   Internal functions for mlib_ImageConv* on S32 type and
30  *   MLIB_EDGE_DST_NO_WRITE mask
31  *
32  */
33 
34 #include "mlib_image.h"
35 #include "mlib_ImageConv.h"
36 
37 /***************************************************************/
/*
 * Widest image row (in pixels) served by the on-stack line buffers of the
 * fixed-size kernels; wider rows make them allocate the buffers on the heap.
 */
#define BUFF_LINE  256

/*
 * Byte budget used by mlib_ImageConv1xN to limit how many source rows it
 * walks per strip (CACHE_SIZE / bytes-per-row).
 */
#define CACHE_SIZE (64*1024)

/***************************************************************/
/*
 * Builds the name of a kernel-specific entry point by token pasting,
 * e.g. CONV_FUNC(3x3) expands to mlib_conv3x3nw_s32.
 */
#define CONV_FUNC(KERN) mlib_conv##KERN##nw_s32
44 
45 /***************************************************************/
#ifndef MLIB_USE_FTOI_CLAMPING

/*
 * Saturates the floating-point value 'src' to the mlib_s32 range and stores
 * the converted result in 'dst'.
 *
 * 'src' must be a modifiable lvalue: it is overwritten with the clamped
 * value.  Both arguments are evaluated more than once, so they must not
 * have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the body of an unbraced 'if').
 */
#define CLAMP_S32(dst, src)                                               \
  do {                                                                    \
    if ((src) > (mlib_d64)MLIB_S32_MAX) (src) = (mlib_d64)MLIB_S32_MAX;   \
    if ((src) < (mlib_d64)MLIB_S32_MIN) (src) = (mlib_d64)MLIB_S32_MIN;   \
    (dst) = (mlib_s32)(src);                                              \
  } while (0)

#else

/*
 * Plain conversion without explicit range checks.
 * NOTE(review): presumably selected on targets whose float-to-int
 * conversion already saturates - confirm against the build configuration.
 */
#define CLAMP_S32(dst, src) ((dst) = (mlib_s32)(src))

#endif /* MLIB_USE_FTOI_CLAMPING */
58 
59 /***************************************************************/
/*
 * Declares and initializes the image geometry locals used by every kernel
 * in this file.  Expects 'src' and 'dst' image pointers in scope.
 *
 *   hgt, wid          height and width taken from src
 *   sll, dll          src/dst line stride expressed in elements of 'type'
 *                     (the stride getter value divided by sizeof(type))
 *   adr_src, adr_dst  data pointers of src/dst viewed as 'type *'
 *   chan1             channel count taken from src
 *
 * The divisor is converted to mlib_s32 so that the division stays signed;
 * dividing by the unsigned sizeof result would convert a negative stride to
 * a large unsigned value first.
 *
 * The expansion ends without a semicolon; the user supplies it.
 */
#define GET_SRC_DST_PARAMETERS(type)                                      \
  mlib_s32 hgt = mlib_ImageGetHeight(src);                                \
  mlib_s32 wid = mlib_ImageGetWidth(src);                                 \
  mlib_s32 sll = mlib_ImageGetStride(src) / (mlib_s32)sizeof(type);       \
  mlib_s32 dll = mlib_ImageGetStride(dst) / (mlib_s32)sizeof(type);       \
  type     *adr_src = (type *)mlib_ImageGetData(src);                     \
  type     *adr_dst = (type *)mlib_ImageGetData(dst);                     \
  mlib_s32 chan1 = mlib_ImageGetChannels(src)
68 /*  mlib_s32 chan2 = chan1 + chan1 */
69 
70 /***************************************************************/
/*
 * Declares the locals shared by the 2x2 .. 5x5 kernels: the image geometry
 * (via GET_SRC_DST_PARAMETERS), source/destination line and pixel cursors,
 * the line-buffer pointers, loop counters and scratch doubles.
 *
 * Requires an object named 'buff' in scope; 'pbuff' starts out pointing at
 * it.  'sp' and 'dp' start as NULL, matching DEF_VARS_MxN.
 *
 * The expansion ends without a semicolon; the user supplies it.
 */
#define DEF_VARS(type)                                          \
  GET_SRC_DST_PARAMETERS(type);                                 \
  type     *sl, *sp = NULL, *sl1, *dl, *dp = NULL;              \
  mlib_d64 *pbuff = buff, *buff0, *buff1, *buff2, *buffT;       \
  mlib_s32 i, j, c;                                             \
  mlib_d64 scalef, d0, d1
77 
78 /***************************************************************/
/*
 * Reduced variant of DEF_VARS used by the 7x7 and 1xN code: declares the
 * image geometry (via GET_SRC_DST_PARAMETERS), the line/pixel cursors
 * ('sp' and 'dp' start as NULL), the 'pbuff' pointer and the loop counters.
 * Unlike DEF_VARS it declares no buff0..buffT, scalef, d0 or d1.
 *
 * Requires an object named 'buff' in scope; 'pbuff' starts out pointing at
 * it.  The expansion ends without a semicolon; the user supplies it.
 */
#define DEF_VARS_MxN(type)                                      \
  GET_SRC_DST_PARAMETERS(type);                                 \
  type     *sl, *sp = NULL, *dl, *dp = NULL;                    \
  mlib_d64 *pbuff = buff;                                       \
  mlib_s32 i, j, c
84 
85 /***************************************************************/
/*
 * Sets 'scalef' to 2^(-scalef_expon).  The exponent is consumed in steps of
 * 30 so that every shift stays below the width of int.
 *
 * Expects 'scalef' and 'scalef_expon' in scope.  Side effect: any exponent
 * above 30 is reduced to the range 1..30 in 'scalef_expon'.
 * 'scalef_expon' must not be negative (a negative shift count is
 * undefined).
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement.
 */
#define CALC_SCALE()                                            \
  do {                                                          \
    scalef = 1.0;                                               \
    while (scalef_expon > 30) {                                 \
      scalef /= (1 << 30);                                      \
      scalef_expon -= 30;                                       \
    }                                                           \
                                                                \
    scalef /= (1 << scalef_expon);                              \
  } while (0)
94 
95 /***************************************************************/
96 #undef  KSIZE
97 #define KSIZE 2
98 
99 mlib_status CONV_FUNC(2x2)(mlib_image       *dst,
100                            const mlib_image *src,
101                            const mlib_s32   *kern,
102                            mlib_s32         scalef_expon,
103                            mlib_s32         cmask)
104 {
105   mlib_d64 buff[(KSIZE + 1)*BUFF_LINE];
106   mlib_d64 k0, k1, k2, k3;
107   mlib_d64 p00, p01, p02, p03,
108            p10, p11, p12, p13;
109   mlib_d64 d2;
110   DEF_VARS(mlib_s32);
111   mlib_s32 chan2 = chan1 + chan1;
112   mlib_s32 chan3 = chan1 + chan2;
113 
114   if (wid > BUFF_LINE) {
115     pbuff = mlib_malloc((KSIZE + 1)*sizeof(mlib_d64)*wid);
116 
117     if (pbuff == NULL) return MLIB_FAILURE;
118   }
119 
120   buff0 = pbuff;
121   buff1 = buff0 + wid;
122   buff2 = buff1 + wid;
123 
124   wid -= (KSIZE - 1);
125   hgt -= (KSIZE - 1);
126 
127   /* keep kernel in regs */
128   CALC_SCALE();
129   k0 = scalef * kern[0];  k1 = scalef * kern[1];
130   k2 = scalef * kern[2];  k3 = scalef * kern[3];
131 
132   for (c = 0; c < chan1; c++) {
133     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
134 
135     sl = adr_src + c;
136     dl = adr_dst + c;
137 
138     sl1 = sl + sll;
139 #ifdef __SUNPRO_C
140 #pragma pipeloop(0)
141 #endif /* __SUNPRO_C */
142     for (i = 0; i < wid + (KSIZE - 1); i++) {
143       buff0[i] = (mlib_d64)sl[i*chan1];
144       buff1[i] = (mlib_d64)sl1[i*chan1];
145     }
146 
147     sl += KSIZE*sll;
148 
149     for (j = 0; j < hgt; j++) {
150       p03 = buff0[0];
151       p13 = buff1[0];
152 
153       sp = sl;
154       dp = dl;
155 
156 #ifdef __SUNPRO_C
157 #pragma pipeloop(0)
158 #endif /* __SUNPRO_C */
159       for (i = 0; i <= (wid - 3); i += 3) {
160 
161         p00 = p03; p10 = p13;
162 
163         p01 = buff0[i + 1]; p11 = buff1[i + 1];
164         p02 = buff0[i + 2]; p12 = buff1[i + 2];
165         p03 = buff0[i + 3]; p13 = buff1[i + 3];
166 
167         buff2[i    ] = (mlib_d64)sp[0];
168         buff2[i + 1] = (mlib_d64)sp[chan1];
169         buff2[i + 2] = (mlib_d64)sp[chan2];
170 
171         d0 = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
172         d1 = p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3;
173         d2 = p02 * k0 + p03 * k1 + p12 * k2 + p13 * k3;
174 
175         CLAMP_S32(dp[0    ], d0);
176         CLAMP_S32(dp[chan1], d1);
177         CLAMP_S32(dp[chan2], d2);
178 
179         sp += chan3;
180         dp += chan3;
181       }
182 
183       for (; i < wid; i++) {
184         p00 = buff0[i];     p10 = buff1[i];
185         p01 = buff0[i + 1]; p11 = buff1[i + 1];
186 
187         buff2[i] = (mlib_d64)sp[0];
188 
189         d0 = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
190         CLAMP_S32(dp[0], d0);
191 
192         sp += chan1;
193         dp += chan1;
194       }
195 
196       buff2[wid] = (mlib_d64)sp[0];
197 
198       sl += sll;
199       dl += dll;
200 
201       buffT = buff0;
202       buff0 = buff1;
203       buff1 = buff2;
204       buff2 = buffT;
205     }
206   }
207 
208   if (pbuff != buff) mlib_free(pbuff);
209 
210   return MLIB_SUCCESS;
211 }
212 
213 /***************************************************************/
214 #undef  KSIZE
215 #define KSIZE 3
216 
217 mlib_status CONV_FUNC(3x3)(mlib_image       *dst,
218                            const mlib_image *src,
219                            const mlib_s32   *kern,
220                            mlib_s32         scalef_expon,
221                            mlib_s32         cmask)
222 {
223   mlib_d64 buff[(KSIZE + 1)*BUFF_LINE], *buff3;
224   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7, k8;
225   mlib_d64 p00, p01, p02, p03,
226            p10, p11, p12, p13,
227            p20, p21, p22, p23;
228   mlib_s32 *sl2;
229   DEF_VARS(mlib_s32);
230   mlib_s32 chan2 = chan1 + chan1;
231 
232   if (wid > BUFF_LINE) {
233     pbuff = mlib_malloc((KSIZE + 1)*sizeof(mlib_d64)*wid);
234 
235     if (pbuff == NULL) return MLIB_FAILURE;
236   }
237 
238   buff0 = pbuff;
239   buff1 = buff0 + wid;
240   buff2 = buff1 + wid;
241   buff3 = buff2 + wid;
242 
243   wid -= (KSIZE - 1);
244   hgt -= (KSIZE - 1);
245 
246   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
247 
248   CALC_SCALE();
249   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2];
250   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5];
251   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8];
252 
253   for (c = 0; c < chan1; c++) {
254     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
255 
256     sl = adr_src + c;
257     dl = adr_dst + c;
258 
259     sl1 = sl  + sll;
260     sl2 = sl1 + sll;
261 #ifdef __SUNPRO_C
262 #pragma pipeloop(0)
263 #endif /* __SUNPRO_C */
264     for (i = 0; i < wid + (KSIZE - 1); i++) {
265       buff0[i] = (mlib_d64)sl[i*chan1];
266       buff1[i] = (mlib_d64)sl1[i*chan1];
267       buff2[i] = (mlib_d64)sl2[i*chan1];
268     }
269 
270     sl += KSIZE*sll;
271 
272     for (j = 0; j < hgt; j++) {
273       mlib_d64 s0, s1;
274 
275       p02 = buff0[0];
276       p12 = buff1[0];
277       p22 = buff2[0];
278 
279       p03 = buff0[1];
280       p13 = buff1[1];
281       p23 = buff2[1];
282 
283       sp = sl;
284       dp = dl;
285 
286       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
287       s1 = p03 * k0 + p13 * k3 + p23 * k6;
288 
289 #ifdef __SUNPRO_C
290 #pragma pipeloop(0)
291 #endif /* __SUNPRO_C */
292       for (i = 0; i <= (wid - 2); i += 2) {
293         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
294         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
295 
296         buff3[i    ] = (mlib_d64)sp[0];
297         buff3[i + 1] = (mlib_d64)sp[chan1];
298 
299         d0 = s0 + p02 * k2 + p12 * k5 + p22 * k8;
300         d1 = s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8;
301 
302         CLAMP_S32(dp[0    ], d0);
303         CLAMP_S32(dp[chan1], d1);
304 
305         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
306         s1 = p03 * k0 + p13 * k3 + p23 * k6;
307 
308         sp += chan2;
309         dp += chan2;
310       }
311 
312       for (; i < wid; i++) {
313         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
314         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
315         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
316 
317         buff3[i] = (mlib_d64)sp[0];
318 
319         d0 = (p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
320               p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
321 
322         CLAMP_S32(dp[0], d0);
323 
324         sp += chan1;
325         dp += chan1;
326       }
327 
328       buff3[wid    ] = (mlib_d64)sp[0];
329       buff3[wid + 1] = (mlib_d64)sp[chan1];
330 
331       sl += sll;
332       dl += dll;
333 
334       buffT = buff0;
335       buff0 = buff1;
336       buff1 = buff2;
337       buff2 = buff3;
338       buff3 = buffT;
339     }
340   }
341 
342   if (pbuff != buff) mlib_free(pbuff);
343 
344   return MLIB_SUCCESS;
345 }
346 
347 /***************************************************************/
348 #undef  KSIZE
349 #define KSIZE 4
350 
351 mlib_status CONV_FUNC(4x4)(mlib_image       *dst,
352                            const mlib_image *src,
353                            const mlib_s32   *kern,
354                            mlib_s32         scalef_expon,
355                            mlib_s32         cmask)
356 {
357   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buff3, *buff4, *buff5;
358   mlib_d64 k[KSIZE*KSIZE];
359   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7;
360   mlib_d64 p00, p01, p02, p03, p04,
361            p10, p11, p12, p13, p14,
362            p20, p21, p22, p23,
363            p30, p31, p32, p33;
364   mlib_s32 *sl2, *sl3;
365   DEF_VARS(mlib_s32);
366   mlib_s32 chan2 = chan1 + chan1;
367 
368   if (wid > BUFF_LINE) {
369     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
370 
371     if (pbuff == NULL) return MLIB_FAILURE;
372   }
373 
374   buff0 = pbuff;
375   buff1 = buff0 + wid;
376   buff2 = buff1 + wid;
377   buff3 = buff2 + wid;
378   buff4 = buff3 + wid;
379   buff5 = buff4 + wid;
380 
381   wid -= (KSIZE - 1);
382   hgt -= (KSIZE - 1);
383 
384   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
385 
386   CALC_SCALE();
387   for (j = 0; j < 16; j++) k[j] = scalef * kern[j];
388 
389   for (c = 0; c < chan1; c++) {
390     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
391 
392     sl = adr_src + c;
393     dl = adr_dst + c;
394 
395     sl1 = sl  + sll;
396     sl2 = sl1 + sll;
397     sl3 = sl2 + sll;
398 #ifdef __SUNPRO_C
399 #pragma pipeloop(0)
400 #endif /* __SUNPRO_C */
401     for (i = 0; i < wid + (KSIZE - 1); i++) {
402       buff0[i] = (mlib_d64)sl[i*chan1];
403       buff1[i] = (mlib_d64)sl1[i*chan1];
404       buff2[i] = (mlib_d64)sl2[i*chan1];
405       buff3[i] = (mlib_d64)sl3[i*chan1];
406     }
407 
408     sl += KSIZE*sll;
409 
410     for (j = 0; j < hgt; j++) {
411       /*
412        *  First loop on two first lines of kernel
413        */
414       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
415       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
416 
417       sp = sl;
418       dp = dl;
419 
420       p02 = buff0[0];
421       p12 = buff1[0];
422       p03 = buff0[1];
423       p13 = buff1[1];
424       p04 = buff0[2];
425 
426 #ifdef __SUNPRO_C
427 #pragma pipeloop(0)
428 #endif /* __SUNPRO_C */
429       for (i = 0; i <= (wid - 2); i += 2) {
430         p00 = p02; p10 = p12;
431         p01 = p03; p11 = p13;
432         p02 = p04; p12 = buff1[i + 2];
433         p03 = buff0[i + 3]; p13 = buff1[i + 3];
434         p04 = buff0[i + 4]; p14 = buff1[i + 4];
435 
436         buff4[i] = (mlib_d64)sp[0];
437         buff4[i + 1] = (mlib_d64)sp[chan1];
438 
439         buff5[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
440                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
441         buff5[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
442                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
443 
444         sp += chan2;
445         dp += chan2;
446       }
447 
448       /*
449        *  Second loop on two last lines of kernel
450        */
451       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
452       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
453 
454       sp = sl;
455       dp = dl;
456 
457       p02 = buff2[0];
458       p12 = buff3[0];
459       p03 = buff2[1];
460       p13 = buff3[1];
461       p04 = buff2[2];
462 
463 #ifdef __SUNPRO_C
464 #pragma pipeloop(0)
465 #endif /* __SUNPRO_C */
466       for (i = 0; i <= (wid - 2); i += 2) {
467         p00 = p02; p10 = p12;
468         p01 = p03; p11 = p13;
469         p02 = p04; p12 = buff3[i + 2];
470         p03 = buff2[i + 3]; p13 = buff3[i + 3];
471         p04 = buff2[i + 4]; p14 = buff3[i + 4];
472 
473         d0 = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
474               p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buff5[i]);
475         d1 = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
476               p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buff5[i + 1]);
477 
478         CLAMP_S32(dp[0    ], d0);
479         CLAMP_S32(dp[chan1], d1);
480 
481         sp += chan2;
482         dp += chan2;
483       }
484 
485       /* last pixels */
486       for (; i < wid; i++) {
487         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
488         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
489         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
490         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
491 
492         buff4[i] = (mlib_d64)sp[0];
493 
494         d0 = (p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
495               p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
496               p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
497               p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
498 
499         CLAMP_S32(dp[0], d0);
500 
501         sp += chan1;
502         dp += chan1;
503       }
504 
505       buff4[wid    ] = (mlib_d64)sp[0];
506       buff4[wid + 1] = (mlib_d64)sp[chan1];
507       buff4[wid + 2] = (mlib_d64)sp[chan2];
508 
509       /* next line */
510       sl += sll;
511       dl += dll;
512 
513       buffT = buff0;
514       buff0 = buff1;
515       buff1 = buff2;
516       buff2 = buff3;
517       buff3 = buff4;
518       buff4 = buffT;
519     }
520   }
521 
522   if (pbuff != buff) mlib_free(pbuff);
523 
524   return MLIB_SUCCESS;
525 }
526 
527 /***************************************************************/
528 #undef  KSIZE
529 #define KSIZE 5
530 
531 mlib_status CONV_FUNC(5x5)(mlib_image       *dst,
532                            const mlib_image *src,
533                            const mlib_s32   *kern,
534                            mlib_s32         scalef_expon,
535                            mlib_s32         cmask)
536 {
537   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buff3, *buff4, *buff5, *buff6;
538   mlib_d64 k[KSIZE*KSIZE];
539   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
540   mlib_d64 p00, p01, p02, p03, p04, p05,
541            p10, p11, p12, p13, p14, p15,
542            p20, p21, p22, p23, p24,
543            p30, p31, p32, p33, p34,
544            p40, p41, p42, p43, p44;
545   mlib_s32 *sl2, *sl3, *sl4;
546   DEF_VARS(mlib_s32);
547   mlib_s32 chan2 = chan1 + chan1;
548   mlib_s32 chan3 = chan1 + chan2;
549 
550   if (wid > BUFF_LINE) {
551     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
552 
553     if (pbuff == NULL) return MLIB_FAILURE;
554   }
555 
556   buff0 = pbuff;
557   buff1 = buff0 + wid;
558   buff2 = buff1 + wid;
559   buff3 = buff2 + wid;
560   buff4 = buff3 + wid;
561   buff5 = buff4 + wid;
562   buff6 = buff5 + wid;
563 
564   wid -= (KSIZE - 1);
565   hgt -= (KSIZE - 1);
566 
567   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
568 
569   CALC_SCALE();
570   for (j = 0; j < 25; j++) k[j] = scalef * kern[j];
571 
572   for (c = 0; c < chan1; c++) {
573     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
574 
575     sl = adr_src + c;
576     dl = adr_dst + c;
577 
578     sl1 = sl  + sll;
579     sl2 = sl1 + sll;
580     sl3 = sl2 + sll;
581     sl4 = sl3 + sll;
582 #ifdef __SUNPRO_C
583 #pragma pipeloop(0)
584 #endif /* __SUNPRO_C */
585     for (i = 0; i < wid + (KSIZE - 1); i++) {
586       buff0[i] = (mlib_d64)sl[i*chan1];
587       buff1[i] = (mlib_d64)sl1[i*chan1];
588       buff2[i] = (mlib_d64)sl2[i*chan1];
589       buff3[i] = (mlib_d64)sl3[i*chan1];
590       buff4[i] = (mlib_d64)sl4[i*chan1];
591     }
592 
593     sl += KSIZE*sll;
594 
595     for (j = 0; j < hgt; j++) {
596       /*
597        *  First loop
598        */
599       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
600       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
601 
602       sp = sl;
603       dp = dl;
604 
605       p02 = buff0[0];
606       p12 = buff1[0];
607       p03 = buff0[1];
608       p13 = buff1[1];
609       p04 = buff0[2];
610       p14 = buff1[2];
611 
612 #ifdef __SUNPRO_C
613 #pragma pipeloop(0)
614 #endif /* __SUNPRO_C */
615       for (i = 0; i <= (wid - 2); i += 2) {
616         p00 = p02; p10 = p12;
617         p01 = p03; p11 = p13;
618         p02 = p04; p12 = p14;
619 
620         p03 = buff0[i + 3]; p13 = buff1[i + 3];
621         p04 = buff0[i + 4]; p14 = buff1[i + 4];
622         p05 = buff0[i + 5]; p15 = buff1[i + 5];
623 
624         buff6[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
625                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
626         buff6[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
627                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
628 
629         sp += chan2;
630         dp += chan2;
631       }
632 
633       /*
634        *  Second loop
635        */
636       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
637       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
638 
639       sp = sl;
640       dp = dl;
641 
642       p02 = buff2[0];
643       p12 = buff3[0];
644       p03 = buff2[1];
645       p13 = buff3[1];
646 
647 #ifdef __SUNPRO_C
648 #pragma pipeloop(0)
649 #endif /* __SUNPRO_C */
650       for (i = 0; i <= (wid - 2); i += 2) {
651         p00 = p02; p10 = p12;
652         p01 = p03; p11 = p13;
653 
654         p02 = buff2[i + 2]; p12 = buff3[i + 2];
655         p03 = buff2[i + 3]; p13 = buff3[i + 3];
656         p04 = buff2[i + 4]; p14 = buff3[i + 4];
657         p05 = buff2[i + 5]; p15 = buff3[i + 5];
658 
659         buff6[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
660                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
661         buff6[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
662                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
663 
664         sp += chan2;
665         dp += chan2;
666       }
667 
668       /*
669        *  3 loop
670        */
671       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
672 
673       sp = sl;
674       dp = dl;
675 
676       p02 = buff4[0];
677       p03 = buff4[1];
678       p04 = buff4[2];
679       p05 = buff4[3];
680 
681 #ifdef __SUNPRO_C
682 #pragma pipeloop(0)
683 #endif /* __SUNPRO_C */
684       for (i = 0; i <= (wid - 2); i += 2) {
685         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
686 
687         p04 = buff4[i + 4]; p05 = buff4[i + 5];
688 
689         buff5[i    ] = (mlib_d64)sp[0];
690         buff5[i + 1] = (mlib_d64)sp[chan1];
691 
692         d0 = p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buff6[i];
693         d1 = p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buff6[i + 1];
694 
695         CLAMP_S32(dp[0    ], d0);
696         CLAMP_S32(dp[chan1], d1);
697 
698         sp += chan2;
699         dp += chan2;
700       }
701 
702       /* last pixels */
703       for (; i < wid; i++) {
704         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
705         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
706         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
707         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
708         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
709 
710         p40 = buff4[i];        p41 = buff4[i + 1]; p42 = buff4[i + 2];
711         p43 = buff4[i + 3]; p44 = buff4[i + 4];
712 
713         buff5[i] = (mlib_d64)sp[0];
714 
715         d0 = (p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
716               p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
717               p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
718               p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
719               p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
720 
721         CLAMP_S32(dp[0], d0);
722 
723         sp += chan1;
724         dp += chan1;
725       }
726 
727       buff5[wid    ] = (mlib_d64)sp[0];
728       buff5[wid + 1] = (mlib_d64)sp[chan1];
729       buff5[wid + 2] = (mlib_d64)sp[chan2];
730       buff5[wid + 3] = (mlib_d64)sp[chan3];
731 
732       /* next line */
733       sl += sll;
734       dl += dll;
735 
736       buffT = buff0;
737       buff0 = buff1;
738       buff1 = buff2;
739       buff2 = buff3;
740       buff3 = buff4;
741       buff4 = buff5;
742       buff5 = buffT;
743     }
744   }
745 
746   if (pbuff != buff) mlib_free(pbuff);
747 
748   return MLIB_SUCCESS;
749 }
750 
751 /***************************************************************/
752 #undef  KSIZE
753 #define KSIZE 7
754 
755 mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
756                            const mlib_image *src,
757                            const mlib_s32   *kern,
758                            mlib_s32         scalef_expon,
759                            mlib_s32         cmask)
760 {
761   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
762   mlib_d64 k[KSIZE*KSIZE];
763   mlib_d64 k0, k1, k2, k3, k4, k5, k6;
764   mlib_d64 p0, p1, p2, p3, p4, p5, p6, p7;
765   mlib_d64 d0, d1;
766   mlib_s32 l, m, buff_ind, *sl2, *sl3, *sl4, *sl5, *sl6;
767   mlib_d64 scalef;
768   DEF_VARS_MxN(mlib_s32);
769   mlib_s32 chan2 = chan1 + chan1;
770   mlib_s32 *sl1;
771 
772   if (wid > BUFF_LINE) {
773     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
774 
775     if (pbuff == NULL) return MLIB_FAILURE;
776   }
777 
778   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
779   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
780   buffd = buffs[KSIZE] + wid;
781 
782   wid -= (KSIZE - 1);
783   hgt -= (KSIZE - 1);
784 
785   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
786 
787   CALC_SCALE();
788   for (j = 0; j < 49; j++) k[j] = scalef * kern[j];
789 
790   for (c = 0; c < chan1; c++) {
791     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
792 
793     sl = adr_src + c;
794     dl = adr_dst + c;
795 
796     sl1 = sl  + sll;
797     sl2 = sl1 + sll;
798     sl3 = sl2 + sll;
799     sl4 = sl3 + sll;
800     sl5 = sl4 + sll;
801     sl6 = sl5 + sll;
802 #ifdef __SUNPRO_C
803 #pragma pipeloop(0)
804 #endif /* __SUNPRO_C */
805     for (i = 0; i < wid + (KSIZE - 1); i++) {
806       buffs[0][i] = (mlib_d64)sl[i*chan1];
807       buffs[1][i] = (mlib_d64)sl1[i*chan1];
808       buffs[2][i] = (mlib_d64)sl2[i*chan1];
809       buffs[3][i] = (mlib_d64)sl3[i*chan1];
810       buffs[4][i] = (mlib_d64)sl4[i*chan1];
811       buffs[5][i] = (mlib_d64)sl5[i*chan1];
812       buffs[6][i] = (mlib_d64)sl6[i*chan1];
813     }
814 
815     buff_ind = 0;
816 
817 #ifdef __SUNPRO_C
818 #pragma pipeloop(0)
819 #endif /* __SUNPRO_C */
820     for (i = 0; i < wid; i++) buffd[i] = 0.0;
821 
822     sl += KSIZE*sll;
823 
824     for (j = 0; j < hgt; j++) {
825       mlib_d64 **buffc = buffs + buff_ind;
826       mlib_d64 *buffn = buffc[KSIZE];
827       mlib_d64 *pk = k;
828 
829       for (l = 0; l < KSIZE; l++) {
830         mlib_d64 *buff = buffc[l];
831 
832         sp = sl;
833         dp = dl;
834 
835         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
836         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
837 
838         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
839         k4 = *pk++; k5 = *pk++; k6 = *pk++;
840 
841         if (l < (KSIZE - 1)) {
842 #ifdef __SUNPRO_C
843 #pragma pipeloop(0)
844 #endif /* __SUNPRO_C */
845           for (i = 0; i <= (wid - 2); i += 2) {
846             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
847 
848             p6 = buff[i + 6]; p7 = buff[i + 7];
849 
850             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
851             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
852           }
853 
854         } else {
855 #ifdef __SUNPRO_C
856 #pragma pipeloop(0)
857 #endif /* __SUNPRO_C */
858           for (i = 0; i <= (wid - 2); i += 2) {
859             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
860 
861             p6 = buff[i + 6]; p7 = buff[i + 7];
862 
863             buffn[i    ] = (mlib_d64)sp[0];
864             buffn[i + 1] = (mlib_d64)sp[chan1];
865 
866             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ];
867             d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1];
868 
869             CLAMP_S32(dp[0    ], d0);
870             CLAMP_S32(dp[chan1], d1);
871 
872             buffd[i    ] = 0.0;
873             buffd[i + 1] = 0.0;
874 
875             sp += chan2;
876             dp += chan2;
877           }
878         }
879       }
880 
881       /* last pixels */
882       for (; i < wid; i++) {
883         mlib_d64 *pk = k, s = 0;
884 
885         for (l = 0; l < KSIZE; l++) {
886           mlib_d64 *buff = buffc[l] + i;
887 
888           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
889         }
890 
891         CLAMP_S32(dp[0], s);
892 
893         buffn[i] = (mlib_d64)sp[0];
894 
895         sp += chan1;
896         dp += chan1;
897       }
898 
899       for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
900 
901       /* next line */
902       sl += sll;
903       dl += dll;
904 
905       buff_ind++;
906 
907       if (buff_ind >= KSIZE + 1) buff_ind = 0;
908     }
909   }
910 
911   if (pbuff != buff) mlib_free(pbuff);
912 
913   return MLIB_SUCCESS;
914 }
915 
916 /***************************************************************/
/* Floating-point type of the kernel coefficients and accumulators in mlib_ImageConv1xN. */
#define FTYPE  mlib_d64
/* Element type of the source and destination images in mlib_ImageConv1xN. */
#define DTYPE  mlib_s32

/* Number of FTYPE elements in the on-stack accumulation buffer of mlib_ImageConv1xN. */
#define BUFF_SIZE  1600
921 
mlib_ImageConv1xN(mlib_image * dst,const mlib_image * src,const mlib_d64 * k,mlib_s32 n,mlib_s32 dn,mlib_s32 cmask)922 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
923                                      const mlib_image *src,
924                                      const mlib_d64   *k,
925                                      mlib_s32         n,
926                                      mlib_s32         dn,
927                                      mlib_s32         cmask)
928 {
929   FTYPE    buff[BUFF_SIZE];
930   mlib_s32 off, kh;
931   const FTYPE    *pk;
932   FTYPE    k0, k1, k2, k3, d0, d1;
933   FTYPE    p0, p1, p2, p3, p4;
934   DTYPE    *sl_c, *dl_c, *sl0;
935   mlib_s32 l, hsize, max_hsize;
936   DEF_VARS_MxN(DTYPE);
937 
938   hgt -= (n - 1);
939   adr_dst += dn*dll;
940 
941   max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
942 
943   if (!max_hsize) max_hsize = 1;
944 
945   if (max_hsize > BUFF_SIZE) {
946     pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
947   }
948 
949   sl_c = adr_src;
950   dl_c = adr_dst;
951 
952   for (l = 0; l < hgt; l += hsize) {
953     hsize = hgt - l;
954 
955     if (hsize > max_hsize) hsize = max_hsize;
956 
957     for (c = 0; c < chan1; c++) {
958     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
959 
960       sl = sl_c + c;
961       dl = dl_c + c;
962 
963 #ifdef __SUNPRO_C
964 #pragma pipeloop(0)
965 #endif /* __SUNPRO_C */
966       for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
967 
968       for (i = 0; i < wid; i++) {
969         sl0 = sl;
970 
971         for (off = 0; off < (n - 4); off += 4) {
972           pk = k + off;
973           sp = sl0;
974 
975           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
976           p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
977           sp += 3*sll;
978 
979 #ifdef __SUNPRO_C
980 #pragma pipeloop(0)
981 #endif /* __SUNPRO_C */
982           for (j = 0; j < hsize; j += 2) {
983             p0 = p2; p1 = p3; p2 = p4;
984             p3 = sp[0];
985             p4 = sp[sll];
986 
987             pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
988             pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
989 
990             sp += 2*sll;
991           }
992 
993           sl0 += 4*sll;
994         }
995 
996         pk = k + off;
997         sp = sl0;
998 
999         k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1000         p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1001 
1002         dp = dl;
1003         kh = n - off;
1004 
1005         if (kh == 4) {
1006           sp += 3*sll;
1007 
1008 #ifdef __SUNPRO_C
1009 #pragma pipeloop(0)
1010 #endif /* __SUNPRO_C */
1011           for (j = 0; j <= (hsize - 2); j += 2) {
1012             p0 = p2; p1 = p3; p2 = p4;
1013             p3 = sp[0];
1014             p4 = sp[sll];
1015 
1016             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
1017             d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1];
1018             CLAMP_S32(dp[0  ], d0);
1019             CLAMP_S32(dp[dll], d1);
1020 
1021             pbuff[j] = 0;
1022             pbuff[j + 1] = 0;
1023 
1024             sp += 2*sll;
1025             dp += 2*dll;
1026           }
1027 
1028           if (j < hsize) {
1029             p0 = p2; p1 = p3; p2 = p4;
1030             p3 = sp[0];
1031 
1032             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
1033             CLAMP_S32(dp[0], d0);
1034 
1035             pbuff[j] = 0;
1036           }
1037 
1038         } else if (kh == 3) {
1039           sp += 2*sll;
1040 
1041 #ifdef __SUNPRO_C
1042 #pragma pipeloop(0)
1043 #endif /* __SUNPRO_C */
1044           for (j = 0; j <= (hsize - 2); j += 2) {
1045             p0 = p2; p1 = p3;
1046             p2 = sp[0];
1047             p3 = sp[sll];
1048 
1049             d0 = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
1050             d1 = p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1];
1051             CLAMP_S32(dp[0  ], d0);
1052             CLAMP_S32(dp[dll], d1);
1053 
1054             pbuff[j] = 0;
1055             pbuff[j + 1] = 0;
1056 
1057             sp += 2*sll;
1058             dp += 2*dll;
1059           }
1060 
1061           if (j < hsize) {
1062             p0 = p2; p1 = p3;
1063             p2 = sp[0];
1064 
1065             d0 = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
1066             CLAMP_S32(dp[0], d0);
1067 
1068             pbuff[j] = 0;
1069           }
1070 
1071         } else if (kh == 2) {
1072           sp += sll;
1073 
1074 #ifdef __SUNPRO_C
1075 #pragma pipeloop(0)
1076 #endif /* __SUNPRO_C */
1077           for (j = 0; j <= (hsize - 2); j += 2) {
1078             p0 = p2;
1079             p1 = sp[0];
1080             p2 = sp[sll];
1081 
1082             d0 = p0*k0 + p1*k1 + pbuff[j];
1083             d1 = p1*k0 + p2*k1 + pbuff[j + 1];
1084             CLAMP_S32(dp[0  ], d0);
1085             CLAMP_S32(dp[dll], d1);
1086 
1087             pbuff[j] = 0;
1088             pbuff[j + 1] = 0;
1089 
1090             sp += 2*sll;
1091             dp += 2*dll;
1092           }
1093 
1094           if (j < hsize) {
1095             p0 = p2;
1096             p1 = sp[0];
1097 
1098             d0 = p0*k0 + p1*k1 + pbuff[j];
1099             CLAMP_S32(dp[0], d0);
1100 
1101             pbuff[j] = 0;
1102           }
1103 
1104         } else /* if (kh == 1) */ {
1105 #ifdef __SUNPRO_C
1106 #pragma pipeloop(0)
1107 #endif /* __SUNPRO_C */
1108           for (j = 0; j < hsize; j++) {
1109             p0 = sp[0];
1110 
1111             d0 = p0*k0 + pbuff[j];
1112             CLAMP_S32(dp[0], d0);
1113 
1114             pbuff[j] = 0;
1115 
1116             sp += sll;
1117             dp += dll;
1118           }
1119         }
1120 
1121         sl += chan1;
1122         dl += chan1;
1123       }
1124     }
1125 
1126     sl_c += max_hsize*sll;
1127     dl_c += max_hsize*dll;
1128   }
1129 
1130   if (pbuff != buff) mlib_free(pbuff);
1131 
1132   return MLIB_SUCCESS;
1133 }
1134 
1135 /***************************************************************/
/* Widest horizontal slice of a kernel row that CONV_FUNC(MxN) processes in
 * one unrolled inner loop; wider rows are split into several slices. */
#define MAX_KER 7

/* Largest kernel height for which the on-stack row-pointer table
 * (2*(MAX_N + 1) entries) in CONV_FUNC(MxN) is used; a taller kernel makes
 * the function allocate its work area on the heap. */
#define MAX_N     15

/* Number of mlib_d64 elements in the on-stack work buffer of
 * CONV_FUNC(MxN).  Replaces the BUFF_SIZE value used by the code above. */
#undef  BUFF_SIZE
#define BUFF_SIZE 1500
1142 
CONV_FUNC(MxN)1143 mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
1144                            const mlib_image *src,
1145                            const mlib_s32   *kernel,
1146                            mlib_s32         m,
1147                            mlib_s32         n,
1148                            mlib_s32         dm,
1149                            mlib_s32         dn,
1150                            mlib_s32         scale,
1151                            mlib_s32         cmask)
1152 {
1153   mlib_d64  buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1154   mlib_d64  **buffs = buffs_arr, *buffd;
1155   mlib_d64  akernel[256], *k = akernel, fscale = 1.0;
1156   mlib_s32  l, off, kw, bsize, buff_ind, mn;
1157   mlib_d64  d0, d1;
1158   mlib_d64  k0, k1, k2, k3, k4, k5, k6;
1159   mlib_d64  p0, p1, p2, p3, p4, p5, p6, p7;
1160   DEF_VARS_MxN(mlib_s32);
1161   mlib_s32 chan2 = chan1 + chan1;
1162 
1163   mlib_status status = MLIB_SUCCESS;
1164 
1165   if (scale > 30) {
1166     fscale *= 1.0/(1 << 30);
1167     scale -= 30;
1168   }
1169 
1170   fscale /= (1 << scale);
1171 
1172   mn = m*n;
1173 
1174   if (mn > 256) {
1175     k = mlib_malloc(mn*sizeof(mlib_d64));
1176 
1177     if (k == NULL) return MLIB_FAILURE;
1178   }
1179 
1180   for (i = 0; i < mn; i++) {
1181     k[i] = kernel[i]*fscale;
1182   }
1183 
1184   if (m == 1) {
1185     status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1186     FREE_AND_RETURN_STATUS;
1187   }
1188 
1189   bsize = (n + 2)*wid;
1190 
1191   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1192     pbuff = mlib_malloc(sizeof(mlib_d64)*bsize + sizeof(mlib_d64*)*2*(n + 1));
1193 
1194     if (pbuff == NULL) {
1195       status = MLIB_FAILURE;
1196       FREE_AND_RETURN_STATUS;
1197     }
1198     buffs = (mlib_d64**)(pbuff + bsize);
1199   }
1200 
1201   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1202   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1203   buffd = buffs[n] + wid;
1204 
1205   wid -= (m - 1);
1206   hgt -= (n - 1);
1207   adr_dst += dn*dll + dm*chan1;
1208 
1209   for (c = 0; c < chan1; c++) {
1210     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1211 
1212     sl = adr_src + c;
1213     dl = adr_dst + c;
1214 
1215     for (l = 0; l < n; l++) {
1216       mlib_d64 *buff = buffs[l];
1217 
1218 #ifdef __SUNPRO_C
1219 #pragma pipeloop(0)
1220 #endif /* __SUNPRO_C */
1221       for (i = 0; i < wid + (m - 1); i++) {
1222         buff[i] = (mlib_d64)sl[i*chan1];
1223       }
1224 
1225       sl += sll;
1226     }
1227 
1228     buff_ind = 0;
1229 
1230 #ifdef __SUNPRO_C
1231 #pragma pipeloop(0)
1232 #endif /* __SUNPRO_C */
1233     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1234 
1235     for (j = 0; j < hgt; j++) {
1236       mlib_d64 **buffc = buffs + buff_ind;
1237       mlib_d64 *buffn = buffc[n];
1238       mlib_d64 *pk = k;
1239 
1240       for (l = 0; l < n; l++) {
1241         mlib_d64 *buff_l = buffc[l];
1242 
1243         for (off = 0; off < m;) {
1244           mlib_d64 *buff = buff_l + off;
1245 
1246           kw = m - off;
1247 
1248           if (kw > 2*MAX_KER) kw = MAX_KER; else
1249             if (kw > MAX_KER) kw = kw/2;
1250           off += kw;
1251 
1252           sp = sl;
1253           dp = dl;
1254 
1255           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1256           p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1257 
1258           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1259           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1260           pk += kw;
1261 
1262           if (kw == 7) {
1263 
1264             if (l < (n - 1) || off < m) {
1265 #ifdef __SUNPRO_C
1266 #pragma pipeloop(0)
1267 #endif /* __SUNPRO_C */
1268               for (i = 0; i <= (wid - 2); i += 2) {
1269                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1270 
1271                 p6 = buff[i + 6]; p7 = buff[i + 7];
1272 
1273                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1274                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1275               }
1276 
1277             } else {
1278 #ifdef __SUNPRO_C
1279 #pragma pipeloop(0)
1280 #endif /* __SUNPRO_C */
1281               for (i = 0; i <= (wid - 2); i += 2) {
1282                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1283 
1284                 p6 = buff[i + 6]; p7 = buff[i + 7];
1285 
1286                 buffn[i    ] = (mlib_d64)sp[0];
1287                 buffn[i + 1] = (mlib_d64)sp[chan1];
1288 
1289                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ];
1290                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1];
1291 
1292                 CLAMP_S32(dp[0],     d0);
1293                 CLAMP_S32(dp[chan1], d1);
1294 
1295                 buffd[i    ] = 0.0;
1296                 buffd[i + 1] = 0.0;
1297 
1298                 sp += chan2;
1299                 dp += chan2;
1300               }
1301             }
1302 
1303           } else if (kw == 6) {
1304 
1305             if (l < (n - 1) || off < m) {
1306 #ifdef __SUNPRO_C
1307 #pragma pipeloop(0)
1308 #endif /* __SUNPRO_C */
1309               for (i = 0; i <= (wid - 2); i += 2) {
1310                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1311 
1312                 p5 = buff[i + 5]; p6 = buff[i + 6];
1313 
1314                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1315                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1316               }
1317 
1318             } else {
1319 #ifdef __SUNPRO_C
1320 #pragma pipeloop(0)
1321 #endif /* __SUNPRO_C */
1322               for (i = 0; i <= (wid - 2); i += 2) {
1323                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1324 
1325                 p5 = buff[i + 5]; p6 = buff[i + 6];
1326 
1327                 buffn[i    ] = (mlib_d64)sp[0];
1328                 buffn[i + 1] = (mlib_d64)sp[chan1];
1329 
1330                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ];
1331                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1];
1332 
1333                 CLAMP_S32(dp[0],     d0);
1334                 CLAMP_S32(dp[chan1], d1);
1335 
1336                 buffd[i    ] = 0.0;
1337                 buffd[i + 1] = 0.0;
1338 
1339                 sp += chan2;
1340                 dp += chan2;
1341               }
1342             }
1343 
1344           } else if (kw == 5) {
1345 
1346             if (l < (n - 1) || off < m) {
1347 #ifdef __SUNPRO_C
1348 #pragma pipeloop(0)
1349 #endif /* __SUNPRO_C */
1350               for (i = 0; i <= (wid - 2); i += 2) {
1351                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1352 
1353                 p4 = buff[i + 4]; p5 = buff[i + 5];
1354 
1355                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1356                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1357               }
1358 
1359             } else {
1360 #ifdef __SUNPRO_C
1361 #pragma pipeloop(0)
1362 #endif /* __SUNPRO_C */
1363               for (i = 0; i <= (wid - 2); i += 2) {
1364                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1365 
1366                 p4 = buff[i + 4]; p5 = buff[i + 5];
1367 
1368                 buffn[i    ] = (mlib_d64)sp[0];
1369                 buffn[i + 1] = (mlib_d64)sp[chan1];
1370 
1371                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ];
1372                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1];
1373 
1374                 CLAMP_S32(dp[0],     d0);
1375                 CLAMP_S32(dp[chan1], d1);
1376 
1377                 buffd[i    ] = 0.0;
1378                 buffd[i + 1] = 0.0;
1379 
1380                 sp += chan2;
1381                 dp += chan2;
1382               }
1383             }
1384 
1385           } else if (kw == 4) {
1386 
1387             if (l < (n - 1) || off < m) {
1388 #ifdef __SUNPRO_C
1389 #pragma pipeloop(0)
1390 #endif /* __SUNPRO_C */
1391               for (i = 0; i <= (wid - 2); i += 2) {
1392                 p0 = p2; p1 = p3; p2 = p4;
1393 
1394                 p3 = buff[i + 3]; p4 = buff[i + 4];
1395 
1396                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1397                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1398               }
1399 
1400             } else {
1401 #ifdef __SUNPRO_C
1402 #pragma pipeloop(0)
1403 #endif /* __SUNPRO_C */
1404               for (i = 0; i <= (wid - 2); i += 2) {
1405                 p0 = p2; p1 = p3; p2 = p4;
1406 
1407                 p3 = buff[i + 3]; p4 = buff[i + 4];
1408 
1409                 buffn[i    ] = (mlib_d64)sp[0];
1410                 buffn[i + 1] = (mlib_d64)sp[chan1];
1411 
1412                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ];
1413                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1];
1414 
1415                 CLAMP_S32(dp[0],     d0);
1416                 CLAMP_S32(dp[chan1], d1);
1417 
1418                 buffd[i    ] = 0.0;
1419                 buffd[i + 1] = 0.0;
1420 
1421                 sp += chan2;
1422                 dp += chan2;
1423               }
1424             }
1425 
1426           } else if (kw == 3) {
1427 
1428             if (l < (n - 1) || off < m) {
1429 #ifdef __SUNPRO_C
1430 #pragma pipeloop(0)
1431 #endif /* __SUNPRO_C */
1432               for (i = 0; i <= (wid - 2); i += 2) {
1433                 p0 = p2; p1 = p3;
1434 
1435                 p2 = buff[i + 2]; p3 = buff[i + 3];
1436 
1437                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1438                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1439               }
1440 
1441             } else {
1442 #ifdef __SUNPRO_C
1443 #pragma pipeloop(0)
1444 #endif /* __SUNPRO_C */
1445               for (i = 0; i <= (wid - 2); i += 2) {
1446                 p0 = p2; p1 = p3;
1447 
1448                 p2 = buff[i + 2]; p3 = buff[i + 3];
1449 
1450                 buffn[i    ] = (mlib_d64)sp[0];
1451                 buffn[i + 1] = (mlib_d64)sp[chan1];
1452 
1453                 d0 = p0*k0 + p1*k1 + p2*k2 + buffd[i    ];
1454                 d1 = p1*k0 + p2*k1 + p3*k2 + buffd[i + 1];
1455 
1456                 CLAMP_S32(dp[0],     d0);
1457                 CLAMP_S32(dp[chan1], d1);
1458 
1459                 buffd[i    ] = 0.0;
1460                 buffd[i + 1] = 0.0;
1461 
1462                 sp += chan2;
1463                 dp += chan2;
1464               }
1465             }
1466 
1467           } else { /* kw == 2 */
1468 
1469             if (l < (n - 1) || off < m) {
1470 #ifdef __SUNPRO_C
1471 #pragma pipeloop(0)
1472 #endif /* __SUNPRO_C */
1473               for (i = 0; i <= (wid - 2); i += 2) {
1474                 p0 = p2;
1475 
1476                 p1 = buff[i + 1]; p2 = buff[i + 2];
1477 
1478                 buffd[i    ] += p0*k0 + p1*k1;
1479                 buffd[i + 1] += p1*k0 + p2*k1;
1480               }
1481 
1482             } else {
1483 #ifdef __SUNPRO_C
1484 #pragma pipeloop(0)
1485 #endif /* __SUNPRO_C */
1486               for (i = 0; i <= (wid - 2); i += 2) {
1487                 p0 = p2;
1488 
1489                 p1 = buff[i + 1]; p2 = buff[i + 2];
1490 
1491                 buffn[i    ] = (mlib_d64)sp[0];
1492                 buffn[i + 1] = (mlib_d64)sp[chan1];
1493 
1494                 d0 = p0*k0 + p1*k1 + buffd[i    ];
1495                 d1 = p1*k0 + p2*k1 + buffd[i + 1];
1496 
1497                 CLAMP_S32(dp[0],     d0);
1498                 CLAMP_S32(dp[chan1], d1);
1499 
1500                 buffd[i    ] = 0.0;
1501                 buffd[i + 1] = 0.0;
1502 
1503                 sp += chan2;
1504                 dp += chan2;
1505               }
1506             }
1507           }
1508         }
1509       }
1510 
1511       /* last pixels */
1512       for (; i < wid; i++) {
1513         mlib_d64 *pk = k, s = 0;
1514         mlib_s32 x;
1515 
1516         for (l = 0; l < n; l++) {
1517           mlib_d64 *buff = buffc[l] + i;
1518 
1519           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1520         }
1521 
1522         CLAMP_S32(dp[0], s);
1523 
1524         buffn[i] = (mlib_d64)sp[0];
1525 
1526         sp += chan1;
1527         dp += chan1;
1528       }
1529 
1530       for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
1531 
1532       /* next line */
1533       sl += sll;
1534       dl += dll;
1535 
1536       buff_ind++;
1537 
1538       if (buff_ind >= n + 1) buff_ind = 0;
1539     }
1540   }
1541 
1542   FREE_AND_RETURN_STATUS;
1543 }
1544 
1545 /***************************************************************/
1546