1 /*
2  * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 
28 /*
29  *      The functions step along the lines from xLeft to xRight and apply
30  *      the bicubic filtering.
31  *
32  */
33 
34 #include "vis_proto.h"
35 #include "mlib_ImageAffine.h"
36 #include "mlib_v_ImageFilters.h"
37 
38 /*#define MLIB_VIS2*/
39 
40 /***************************************************************/
/* Element type of the images processed in this file: unsigned 8-bit. */
#define DTYPE  mlib_u8

/*
 * Filter precision selector. It is not referenced directly in the visible
 * part of this file; presumably the included headers derive FILTER_SHIFT
 * and FILTER_MASK from it - TODO confirm against mlib_ImageAffine.h.
 */
#define FILTER_BITS  8
44 
45 /***************************************************************/
/*
 * MLIB_WRITE_BMASK(bmask): when MLIB_VIS2 is defined, writes the byte-shuffle
 * mask used by later vis_bshuffle() calls; otherwise it expands to nothing,
 * because the non-VIS2 code paths in this file do not use vis_bshuffle().
 */
#ifdef MLIB_VIS2
#define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
#else
#define MLIB_WRITE_BMASK(bmask)
#endif /* MLIB_VIS2 */
51 
52 /***************************************************************/
/* Short alias used by the macros below for the current source pixel pointer. */
#define sPtr srcPixelPtr
54 
55 /***************************************************************/
/*
 * NEXT_PIXEL_1BC_U8(): positions sPtr for the current fixed-point source
 * coordinate (X, Y) of a 1-channel image.
 *
 * xSrc/ySrc are the integer parts of X/Y (X >> MLIB_SHIFT) minus one, so the
 * pointer lands one column left of and one row above the integer position -
 * presumably the top-left corner of the 4x4 bicubic neighbourhood.
 * Reads X, Y, lineAddr; writes xSrc, ySrc, sPtr. Does not advance X or Y.
 */
#define NEXT_PIXEL_1BC_U8()                                     \
  xSrc = (X>>MLIB_SHIFT)-1;                                     \
  ySrc = (Y>>MLIB_SHIFT)-1;                                     \
  sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
60 
61 /***************************************************************/
/*
 * ALIGN_ADDR(da, dp): calls vis_alignaddr(dp, 0) and leaves in `da` the
 * address to be used for the following 8-byte loads.
 *
 * - Without MLIB_VIS2, `da` is simply the value returned by vis_alignaddr().
 * - With MLIB_VIS2, vis_alignaddr() is called only for its side effect and
 *   `da` is computed explicitly by clearing the low three address bits.
 *
 * In both variants the call is what prepares the subsequent
 * vis_faligndata() in the callers (per the VIS documentation it records the
 * byte offset of `dp` in the GSR).
 *
 * NOTE: the VIS2 variant expands to two statements, so the macro must not be
 * used as the sole body of an unbraced if/else/loop.
 */
#ifndef MLIB_VIS2

#define ALIGN_ADDR(da, dp)                                      \
  da = vis_alignaddr(dp, 0)

#else

#define ALIGN_ADDR(da, dp)                                      \
  vis_alignaddr(dp, 0);                                         \
  da = (mlib_d64*)(((mlib_addr)(dp)) &~ 7)

#endif /* MLIB_VIS2 */
74 
75 /***************************************************************/
/*
 * LOAD_BC_U8_1CH_1PIXEL(mlib_filters_u8): loads everything needed to filter
 * one 1-channel pixel and then steps to the next source coordinate.
 *
 * - Starting at sPtr, reads four consecutive source rows (sPtr advances by
 *   srcYStride between rows, three times in total). For each row two aligned
 *   8-byte words are loaded and combined with vis_faligndata(), giving the
 *   8 bytes that start at sPtr, into row00, row10, row20, row30.
 *   Note that 16 bytes around sPtr are touched per row even though only the
 *   first bytes are used later.
 * - Loads yFilter and xFilter as 8-byte entries of the table
 *   `mlib_filters_u8`, at the byte offsets obtained from Y and X via
 *   FILTER_SHIFT / FILTER_MASK.
 * - Finally advances X by dX and Y by dY.
 *
 * Clobbers dpSrc, data0, data1, filterposx, filterposy and leaves sPtr on
 * the fourth row.
 */
#define LOAD_BC_U8_1CH_1PIXEL(mlib_filters_u8)                         \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row00 = vis_faligndata(data0, data1);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row10 = vis_faligndata(data0, data1);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row20 = vis_faligndata(data0, data1);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row30 = vis_faligndata(data0, data1);                                \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                      \
  yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                      \
  xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx));  \
  X += dX;                                                             \
  Y += dY
102 
103 /***************************************************************/
/*
 * SUM_4x16(v1, v3): horizontal sum of the four 16-bit lanes of v3; the
 * result is written into the low 32 bits of v1 (as two 16-bit lanes).
 *
 * Non-VIS2 variant: rotates v3 by two bytes (vis_alignaddr((void*)2, 0)
 * followed by vis_faligndata(v3, v3)), adds the rotated copy to v3, then
 * adds the high and low halves of that sum. This changes the alignment
 * offset set by the last ALIGN_ADDR; callers re-run ALIGN_ADDR before their
 * next vis_faligndata().
 *
 * VIS2 variant: adds the halves of v3, rearranges the result with
 * vis_bshuffle() and adds the halves again. It depends on the byte-shuffle
 * mask written earlier through MLIB_WRITE_BMASK() and overwrites v3.
 *
 * Both variants use the caller's v0 and/or v2 as scratch variables.
 */
#ifndef MLIB_VIS2

#define SUM_4x16(v1, v3)                                        \
  vis_alignaddr((void*)2, 0);                                   \
  v0 = vis_faligndata(v3, v3);                                  \
  v2 = vis_fpadd16(v3, v0);                                     \
  v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)))

#else

#define SUM_4x16(v1, v3)                                              \
  v2 = vis_freg_pair(vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)),  \
                     vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3))); \
  v3 = vis_bshuffle(v2, v2);                                          \
  v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))

#endif /* MLIB_VIS2 */
121 
122 /***************************************************************/
/*
 * RESULT_1BC_U8_1PIXEL(ind): computes one filtered 1-channel pixel from the
 * rows row0<ind> .. row3<ind> and the currently loaded yFilter / xFilter.
 *
 * Vertical pass: the first four bytes of each row are multiplied by one
 * 16-bit coefficient of yFilter (the au/al forms pick the upper/lower 16 bits
 * of the given 32-bit half) and the four products are accumulated in `sum`.
 * Horizontal pass: `sum` is multiplied lane-wise by xFilter using the
 * vis_fmul8sux16 / vis_fmul8ulx16 pair, the four lanes are added together
 * with SUM_4x16, and the packed result is placed in the low half of `res`.
 *
 * The caller stores the pixel with vis_st_u8(res, ...). Clobbers v0..v3 and
 * sum (plus whatever SUM_4x16 clobbers).
 */
#define RESULT_1BC_U8_1PIXEL(ind)                                    \
  v0 = vis_fmul8x16au(vis_read_hi(row0##ind), vis_read_hi(yFilter)); \
  v1 = vis_fmul8x16al(vis_read_hi(row1##ind), vis_read_hi(yFilter)); \
  sum = vis_fpadd16(v0, v1);                                         \
  v2 = vis_fmul8x16au(vis_read_hi(row2##ind), vis_read_lo(yFilter)); \
  sum = vis_fpadd16(sum, v2);                                        \
  v3 = vis_fmul8x16al(vis_read_hi(row3##ind), vis_read_lo(yFilter)); \
  sum = vis_fpadd16(sum, v3);                                        \
  v0 = vis_fmul8sux16(sum, xFilter);                                 \
  v1 = vis_fmul8ulx16(sum, xFilter);                                 \
  v3 = vis_fpadd16(v1, v0);                                          \
  SUM_4x16(v1, v3);                                                  \
  res = vis_write_lo(res, vis_fpack16(v1))
136 
137 /***************************************************************/
/*
 * BC_U8_1CH(index, ind1, ind2, mlib_filters_u8): one software-pipelined step
 * of the 1-channel inner loop. Three pixels are in flight at once:
 *
 * - pixel n   : its rows row?<ind1> and the current yFilter/xFilter are
 *               multiplied and accumulated; the four per-lane products are
 *               left unsummed in d<index> (FADD_1BC_U8 reduces them later).
 * - pixel n+1 : its four source rows are loaded from sPtr into row?<ind2>,
 *               and its yFilter/xFilter are fetched from `mlib_filters_u8`
 *               (filterposx/filterposy are taken from X/Y before they are
 *               advanced).
 * - pixel n+2 : X and Y are advanced by dX/dY and sPtr is repositioned to
 *               lineAddr[ySrc] + xSrc for the following step.
 *
 * Callers alternate ind1/ind2 between 0 and 1 so that the rows loaded by one
 * step are consumed by the next. The arithmetic and the loads are interleaved
 * statement by statement (presumably to help instruction scheduling); the
 * order must be preserved because yFilter/xFilter and sPtr are overwritten
 * only after their last use for the current pixel.
 */
#define BC_U8_1CH(index, ind1, ind2, mlib_filters_u8)                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  v0 = vis_fmul8x16au(vis_read_hi(row0##ind1), vis_read_hi(yFilter));  \
  filterposy = (Y >> FILTER_SHIFT);                                    \
  data1 = dpSrc[1];                                                    \
  v1 = vis_fmul8x16al(vis_read_hi(row1##ind1), vis_read_hi(yFilter));  \
  row0##ind2 = vis_faligndata(data0, data1);                           \
  filterposx = (X >> FILTER_SHIFT);                                    \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  sum = vis_fpadd16(v0, v1);                                           \
  data0 = dpSrc[0];                                                    \
  v2 = vis_fmul8x16au(vis_read_hi(row2##ind1), vis_read_lo(yFilter));  \
  X += dX;                                                             \
  data1 = dpSrc[1];                                                    \
  row1##ind2 = vis_faligndata(data0, data1);                           \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  Y += dY;                                                             \
  sum = vis_fpadd16(sum, v2);                                          \
  xSrc = (X>>MLIB_SHIFT)-1;                                            \
  v3 = vis_fmul8x16al(vis_read_hi(row3##ind1), vis_read_lo(yFilter));  \
  data0 = dpSrc[0];                                                    \
  ySrc = (Y>>MLIB_SHIFT)-1;                                            \
  sum = vis_fpadd16(sum, v3);                                          \
  data1 = dpSrc[1];                                                    \
  filterposy &= FILTER_MASK;                                           \
  v0 = vis_fmul8sux16(sum, xFilter);                                   \
  row2##ind2 = vis_faligndata(data0, data1);                           \
  sPtr += srcYStride;                                                  \
  v1 = vis_fmul8ulx16(sum, xFilter);                                   \
  filterposx &= FILTER_MASK;                                           \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  d##index = vis_fpadd16(v0, v1);                                      \
  data1 = dpSrc[1];                                                    \
  row3##ind2 = vis_faligndata(data0, data1);                           \
  yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
  xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx));  \
  sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
179 
180 /***************************************************************/
/*
 * FADD_1BC_U8(): reduces the four product vectors d0..d3 produced by
 * BC_U8_1CH (one per pixel, four 16-bit lanes each) to `res`, which ends up
 * holding four 16-bit sums, one per pixel, ready for vis_fpack16().
 *
 * Non-VIS2 variant: each d<k> is first folded to two partial sums
 * (high half + low half); the vis_fpmerge() sequence then transposes the
 * bytes so that e0 holds the first partial sum of pixels 0..3 and e1 the
 * second, in pixel order, and the two are added.
 *
 * VIS2 variant: same folding, but the rearrangement is done with
 * vis_bshuffle(), which depends on the mask written via MLIB_WRITE_BMASK().
 * It uses v0..v3 as scratch instead of p0..p3, m02, m13, m0213, e0, e1.
 */
#ifndef MLIB_VIS2

#define FADD_1BC_U8()                                           \
  p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
  p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1));          \
  p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
  p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3));          \
  m02 = vis_fpmerge(p0, p2);                                    \
  m13 = vis_fpmerge(p1, p3);                                    \
  m0213 = vis_fpmerge(vis_read_hi(m02), vis_read_hi(m13));      \
  e0 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213));     \
  m0213 = vis_fpmerge(vis_read_lo(m02), vis_read_lo(m13));      \
  e1 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213));     \
  res = vis_fpadd16(e0, e1)

#else

#define FADD_1BC_U8()                                                 \
  v0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)),  \
                     vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1))); \
  v1 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)),  \
                     vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3))); \
  v2 = vis_bshuffle(v0, v0);                                          \
  v3 = vis_bshuffle(v1, v1);                                          \
  res = vis_freg_pair(vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)), \
                      vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))

#endif /* MLIB_VIS2 */
209 
210 /***************************************************************/
mlib_ImageAffine_u8_1ch_bc(mlib_affine_param * param)211 mlib_status mlib_ImageAffine_u8_1ch_bc (mlib_affine_param *param)
212 {
213   DECLAREVAR_BC();
214   mlib_s32  filterposx, filterposy;
215   mlib_d64  data0, data1;
216   mlib_d64  sum;
217   mlib_d64  row00, row10, row20, row30;
218   mlib_d64  row01, row11, row21, row31;
219   mlib_d64  xFilter, yFilter;
220   mlib_d64  v0, v1, v2, v3;
221   mlib_d64  d0, d1, d2, d3;
222 #ifndef MLIB_VIS2
223   mlib_f32  p0, p1, p2, p3;
224   mlib_d64  e0, e1;
225   mlib_d64  m02, m13, m0213;
226 #endif /* MLIB_VIS2 */
227   mlib_d64  *dpSrc;
228   mlib_s32  align, cols, i;
229   mlib_d64  res;
230   const mlib_s16 *mlib_filters_table;
231 
232   if (filter == MLIB_BICUBIC) {
233     mlib_filters_table = mlib_filters_u8_bc;
234   } else {
235     mlib_filters_table = mlib_filters_u8_bc2;
236   }
237 
238   for (j = yStart; j <= yFinish; j++) {
239 
240     vis_write_gsr(3 << 3);
241     MLIB_WRITE_BMASK(0x0145ABEF);
242 
243     CLIP(1);
244 
245     cols = xRight - xLeft + 1;
246     align = (4 - ((mlib_addr)dstPixelPtr) & 3) & 3;
247     align = (cols < align)? cols : align;
248 
249     for (i = 0; i < align; i++) {
250       NEXT_PIXEL_1BC_U8();
251       LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
252       RESULT_1BC_U8_1PIXEL(0);
253       vis_st_u8(res, dstPixelPtr++);
254     }
255 
256     if (i <= cols - 10) {
257 
258       NEXT_PIXEL_1BC_U8();
259       LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
260 
261       NEXT_PIXEL_1BC_U8();
262 
263       BC_U8_1CH(0, 0, 1, mlib_filters_table);
264       BC_U8_1CH(1, 1, 0, mlib_filters_table);
265       BC_U8_1CH(2, 0, 1, mlib_filters_table);
266       BC_U8_1CH(3, 1, 0, mlib_filters_table);
267 
268       FADD_1BC_U8();
269 
270       BC_U8_1CH(0, 0, 1, mlib_filters_table);
271       BC_U8_1CH(1, 1, 0, mlib_filters_table);
272       BC_U8_1CH(2, 0, 1, mlib_filters_table);
273       BC_U8_1CH(3, 1, 0, mlib_filters_table);
274 
275 #pragma pipeloop(0)
276       for (; i <= cols - 14; i+=4) {
277         *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
278         FADD_1BC_U8();
279         BC_U8_1CH(0, 0, 1, mlib_filters_table);
280         BC_U8_1CH(1, 1, 0, mlib_filters_table);
281         BC_U8_1CH(2, 0, 1, mlib_filters_table);
282         BC_U8_1CH(3, 1, 0, mlib_filters_table);
283         dstPixelPtr += 4;
284       }
285 
286       *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
287       dstPixelPtr += 4;
288       FADD_1BC_U8();
289       *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
290       dstPixelPtr += 4;
291 
292       RESULT_1BC_U8_1PIXEL(0);
293       vis_st_u8(res, dstPixelPtr++);
294 
295       LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
296       RESULT_1BC_U8_1PIXEL(0);
297       vis_st_u8(res, dstPixelPtr++);
298       i += 10;
299     }
300 
301     for (; i < cols; i++) {
302       NEXT_PIXEL_1BC_U8();
303       LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
304       RESULT_1BC_U8_1PIXEL(0);
305       vis_st_u8(res, dstPixelPtr++);
306     }
307   }
308 
309   return MLIB_SUCCESS;
310 }
311 
312 /***************************************************************/
/*
 * FADD_2BC_U8(): reduces the product vectors left by four BC_U8_2CH steps
 * (d0<k> and d1<k> for k = 0..3) to eight packed bytes in `res`.
 *
 * For each pixel k the two product vectors are added, then the high and low
 * halves of that sum are added, leaving two 16-bit values in p<k> (one per
 * channel, given how BC_U8_2CH lays out the x-filter coefficients).
 * p0..p3 are paired into e0/e1 and packed with vis_fpack16_pair().
 * Clobbers d0..d3, p0..p3, e0, e1.
 */
#define FADD_2BC_U8()                                           \
  d0 = vis_fpadd16(d00, d10);                                   \
  d1 = vis_fpadd16(d01, d11);                                   \
  d2 = vis_fpadd16(d02, d12);                                   \
  d3 = vis_fpadd16(d03, d13);                                   \
  p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
  p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1));          \
  p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
  p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3));          \
  e0 = vis_freg_pair(p0, p1);                                   \
  e1 = vis_freg_pair(p2, p3);                                   \
  res = vis_fpack16_pair(e0, e1)
325 
326 /***************************************************************/
/*
 * LOAD_BC_U8_2CH_1PIXEL(mlib_filters_u8): loads the filters and the four
 * source rows for one 2-channel pixel.
 *
 * - Fetches yFilter and xFilter as 8-byte entries of `mlib_filters_u8` at
 *   the byte offsets derived from Y and X (FILTER_SHIFT / FILTER_MASK), then
 *   advances X by dX and Y by dY.
 * - Reads four consecutive rows starting at sPtr (stepping by srcYStride),
 *   each as the 8 bytes beginning at sPtr assembled from two aligned 8-byte
 *   loads via vis_faligndata(), into row0..row3.
 *
 * Unlike the 1-channel variant the filters are loaded before the rows.
 * Clobbers dpSrc, data0, data1, filterposx, filterposy; leaves sPtr on the
 * fourth row.
 */
#define LOAD_BC_U8_2CH_1PIXEL(mlib_filters_u8)                         \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                      \
  yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                      \
  xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx));  \
  X += dX;                                                             \
  Y += dY;                                                             \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row0 = vis_faligndata(data0, data1);                                 \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row1 = vis_faligndata(data0, data1);                                 \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row2 = vis_faligndata(data0, data1);                                 \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  row3 = vis_faligndata(data0, data1)
353 
354 /***************************************************************/
/*
 * NEXT_PIXEL_2BC_U8(): same as NEXT_PIXEL_1BC_U8 but for 2-channel data:
 * the column index is doubled (xSrc<<1) to obtain the byte offset in the
 * source line. Writes xSrc, ySrc, sPtr; does not advance X or Y.
 */
#define NEXT_PIXEL_2BC_U8()                                     \
  xSrc = (X>>MLIB_SHIFT)-1;                                     \
  ySrc = (Y>>MLIB_SHIFT)-1;                                     \
  sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
359 
360 /***************************************************************/
/*
 * RESULT_2BC_U8_1PIXEL(): computes one filtered 2-channel pixel from
 * row0..row3, yFilter and xFilter and leaves it packed in the low half of
 * `res`.
 *
 * - Vertical pass: both 4-byte halves of every row are multiplied by the
 *   matching yFilter coefficient and accumulated in sum0 (high halves) and
 *   sum1 (low halves).
 * - The vis_fpmerge() chain on dr/dr1 rebuilds the x-filter so that every
 *   16-bit coefficient appears twice in a row: xFilter0 = {x0,x0,x1,x1},
 *   xFilter1 = {x2,x2,x3,x3}. This matches the interleaved channel layout
 *   of the row data.
 * - Horizontal pass: sum0/sum1 are multiplied by xFilter0/xFilter1
 *   (fmul8sux16 + fmul8ulx16 pairs), added together, and the high and low
 *   halves of the total are added, yielding one 16-bit value per channel.
 *
 * It is interleaved with the arithmetic purely at statement level; no memory
 * is accessed. Clobbers v00..v31, sum0, sum1, dr, dr1, xFilter0, xFilter1, d0.
 */
#define RESULT_2BC_U8_1PIXEL()                                   \
  v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter)); \
  dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));  \
  v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter)); \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));            \
  v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter)); \
  dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));           \
  v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter)); \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));            \
  v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter)); \
  xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));     \
  v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter)); \
  xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));     \
  v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter)); \
  sum0 = vis_fpadd16(v00, v10);                                  \
  v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter)); \
  sum1 = vis_fpadd16(v01, v11);                                  \
  sum0 = vis_fpadd16(sum0, v20);                                 \
  sum1 = vis_fpadd16(sum1, v21);                                 \
  sum0 = vis_fpadd16(sum0, v30);                                 \
  sum1 = vis_fpadd16(sum1, v31);                                 \
  v00 = vis_fmul8sux16(sum0, xFilter0);                          \
  v01 = vis_fmul8sux16(sum1, xFilter1);                          \
  v10 = vis_fmul8ulx16(sum0, xFilter0);                          \
  sum0 = vis_fpadd16(v00, v10);                                  \
  v11 = vis_fmul8ulx16(sum1, xFilter1);                          \
  sum1 = vis_fpadd16(v01, v11);                                  \
  d0 = vis_fpadd16(sum0, sum1);                                  \
  v00 = vis_write_lo(v00, vis_fpadd16s(vis_read_hi(d0),          \
                                       vis_read_lo(d0)));        \
  res = vis_write_lo(res, vis_fpack16(v00))
392 
393 /***************************************************************/
/*
 * BC_U8_2CH(index, mlib_filters_u8): one software-pipelined step of the
 * 2-channel inner loop.
 *
 * - Current pixel: row0..row3, yFilter and xFilter are consumed first (all
 *   eight vertical products and the expanded xFilter0/xFilter1 are formed
 *   before any row is overwritten); the horizontal products are left
 *   unsummed in d0<index> (from sum0) and d1<index> (from sum1) for
 *   FADD_2BC_U8.
 * - Next pixel: its four rows are loaded from sPtr into row0..row3 and its
 *   yFilter/xFilter are fetched from `mlib_filters_u8`, using the X/Y values
 *   sampled before they are advanced.
 * - Pixel after next: X and Y are advanced by dX/dY and sPtr is set to
 *   lineAddr[ySrc] + 2*xSrc.
 *
 * Unlike BC_U8_1CH there is a single set of row variables, which is why the
 * multiplications by yFilter are all issued ahead of the loads. The exact
 * interleaving of the remaining statements is presumably a scheduling choice
 * and should be preserved.
 */
#define BC_U8_2CH(index, mlib_filters_u8)                              \
  v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter));       \
  dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));        \
  v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter));       \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));                  \
  v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter));       \
  dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));                 \
  v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter));       \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));                  \
  v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter));       \
  xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));           \
  v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter));       \
  xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));           \
  v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter));       \
  v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter));       \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  sum0 = vis_fpadd16(v00, v10);                                        \
  filterposy = (Y >> FILTER_SHIFT);                                    \
  data1 = dpSrc[1];                                                    \
  row0 = vis_faligndata(data0, data1);                                 \
  filterposx = (X >> FILTER_SHIFT);                                    \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  sum1 = vis_fpadd16(v01, v11);                                        \
  X += dX;                                                             \
  data1 = dpSrc[1];                                                    \
  sum0 = vis_fpadd16(sum0, v20);                                       \
  row1 = vis_faligndata(data0, data1);                                 \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  Y += dY;                                                             \
  sum1 = vis_fpadd16(sum1, v21);                                       \
  xSrc = (X>>MLIB_SHIFT)-1;                                            \
  data0 = dpSrc[0];                                                    \
  ySrc = (Y>>MLIB_SHIFT)-1;                                            \
  sum0 = vis_fpadd16(sum0, v30);                                       \
  data1 = dpSrc[1];                                                    \
  filterposy &= FILTER_MASK;                                           \
  sum1 = vis_fpadd16(sum1, v31);                                       \
  v00 = vis_fmul8sux16(sum0, xFilter0);                                \
  row2 = vis_faligndata(data0, data1);                                 \
  v01 = vis_fmul8sux16(sum1, xFilter1);                                \
  sPtr += srcYStride;                                                  \
  v10 = vis_fmul8ulx16(sum0, xFilter0);                                \
  filterposx &= FILTER_MASK;                                           \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  v11= vis_fmul8ulx16(sum1, xFilter1);                                 \
  data0 = dpSrc[0];                                                    \
  d0##index = vis_fpadd16(v00, v10);                                   \
  data1 = dpSrc[1];                                                    \
  row3 = vis_faligndata(data0, data1);                                 \
  yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
  d1##index = vis_fpadd16(v01, v11);                                   \
  xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx));  \
  sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
451 
452 /***************************************************************/
mlib_ImageAffine_u8_2ch_bc(mlib_affine_param * param)453 mlib_status mlib_ImageAffine_u8_2ch_bc (mlib_affine_param *param)
454 {
455   DECLAREVAR_BC();
456   DTYPE  *dstLineEnd;
457   mlib_s32  filterposx, filterposy;
458   mlib_d64  data0, data1;
459   mlib_d64  sum0, sum1;
460   mlib_d64  row0, row1, row2, row3;
461   mlib_f32  p0, p1, p2, p3;
462   mlib_d64  xFilter;
463   mlib_d64  xFilter0, xFilter1, yFilter;
464   mlib_d64  v00, v10, v20, v30;
465   mlib_d64  v01, v11, v21, v31;
466   mlib_d64  d0, d1, d2, d3;
467   mlib_d64  d00, d01, d02, d03;
468   mlib_d64  d10, d11, d12, d13;
469   mlib_d64  e0, e1;
470   mlib_d64  *dpSrc;
471   mlib_s32  cols, i, mask, off;
472   mlib_d64  dr, dr1;
473   mlib_d64  res, *dp;
474   const mlib_s16 *mlib_filters_table;
475 
476   if (filter == MLIB_BICUBIC) {
477     mlib_filters_table = mlib_filters_u8_bc;
478   } else {
479     mlib_filters_table = mlib_filters_u8_bc2;
480   }
481 
482   for (j = yStart; j <= yFinish; j++) {
483 
484     vis_write_gsr(3 << 3);
485 
486     CLIP(2);
487     dstLineEnd  = (DTYPE*)dstData + 2 * xRight;
488 
489     cols = xRight - xLeft + 1;
490     dp = vis_alignaddr(dstPixelPtr, 0);
491     off = dstPixelPtr - (mlib_u8*)dp;
492     dstLineEnd += 1;
493     mask = vis_edge8(dstPixelPtr, dstLineEnd);
494     i = 0;
495 
496     if (i <= cols - 10) {
497 
498       NEXT_PIXEL_2BC_U8();
499       LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
500 
501       NEXT_PIXEL_2BC_U8();
502 
503       BC_U8_2CH(0, mlib_filters_table);
504       BC_U8_2CH(1, mlib_filters_table);
505       BC_U8_2CH(2, mlib_filters_table);
506       BC_U8_2CH(3, mlib_filters_table);
507 
508       FADD_2BC_U8();
509 
510       BC_U8_2CH(0, mlib_filters_table);
511       BC_U8_2CH(1, mlib_filters_table);
512       BC_U8_2CH(2, mlib_filters_table);
513       BC_U8_2CH(3, mlib_filters_table);
514 
515 #pragma pipeloop(0)
516       for (; i <= cols-14; i+=4) {
517         vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
518         res = vis_faligndata(res, res);
519         vis_pst_8(res, dp++, mask);
520         vis_pst_8(res, dp, ~mask);
521         FADD_2BC_U8();
522         BC_U8_2CH(0, mlib_filters_table);
523         BC_U8_2CH(1, mlib_filters_table);
524         BC_U8_2CH(2, mlib_filters_table);
525         BC_U8_2CH(3, mlib_filters_table);
526       }
527 
528       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
529       res = vis_faligndata(res, res);
530       vis_pst_8(res, dp++, mask);
531       vis_pst_8(res, dp, ~mask);
532 
533       FADD_2BC_U8();
534       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
535       res = vis_faligndata(res, res);
536       vis_pst_8(res, dp++, mask);
537       vis_pst_8(res, dp, ~mask);
538 
539       dstPixelPtr = (mlib_u8*)dp + off;
540 
541       RESULT_2BC_U8_1PIXEL();
542       vis_alignaddr((void *)7, 0);
543       vis_st_u8(res, dstPixelPtr+1);
544       res = vis_faligndata(res, res);
545       vis_st_u8(res, dstPixelPtr);
546       dstPixelPtr += 2;
547 
548       LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
549       RESULT_2BC_U8_1PIXEL();
550       vis_alignaddr((void *)7, 0);
551       vis_st_u8(res, dstPixelPtr+1);
552       res = vis_faligndata(res, res);
553       vis_st_u8(res, dstPixelPtr);
554       dstPixelPtr += 2;
555       i += 10;
556     }
557 
558     for (; i < cols; i++) {
559       NEXT_PIXEL_2BC_U8();
560       LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
561       RESULT_2BC_U8_1PIXEL();
562       vis_alignaddr((void *)7, 0);
563       vis_st_u8(res, dstPixelPtr+1);
564       res = vis_faligndata(res, res);
565       vis_st_u8(res, dstPixelPtr);
566       dstPixelPtr += 2;
567     }
568   }
569 
570   return MLIB_SUCCESS;
571 }
572 
573 /***************************************************************/
/*
 * FADD_3BC_U8(): folds the three product vectors d0, d1, d2 (twelve 16-bit
 * lanes, presumably four 3-channel pixels laid out consecutively - confirm
 * against the 3-channel RESULT/BC macros) into per-channel sums and packs
 * them into f0.f with vis_fpack16().
 *
 * Non-VIS2 variant: vis_faligndata() with align offsets 6 and 2 is used to
 * shift the lanes so that matching channels line up; after the three adds
 * the first three lanes of d0 hold the channel sums. The fourth lane is a
 * by-product and is not meaningful. Changes the alignment offset set by the
 * last ALIGN_ADDR.
 *
 * VIS2 variant: same idea using vis_bshuffle() and an align offset of 4; it
 * depends on a byte-shuffle mask written elsewhere by the caller.
 *
 * Both variants overwrite d0..d4. `f0` is declared by the caller; it is
 * accessed as f0.f here and f0.t[] in STORE_BC_U8_3CH_1PIXEL.
 */
#ifndef MLIB_VIS2

#define FADD_3BC_U8()                                           \
  vis_alignaddr((void*)6, 0);                                   \
  d3 = vis_faligndata(d0, d1);                                  \
  vis_alignaddr((void*)2, 0);                                   \
  d4 = vis_faligndata(d1, d2);                                  \
  d0 = vis_fpadd16(d0, d3);                                     \
  d2 = vis_fpadd16(d2, d4);                                     \
  d1 = vis_faligndata(d2, d2);                                  \
  d0 = vis_fpadd16(d0, d1);                                     \
  f0.f = vis_fpack16(d0)

#else

#define FADD_3BC_U8()                                           \
  vis_alignaddr((void*)4, 0);                                   \
  d3 = vis_bshuffle(d0, d1);                                    \
  d1 = vis_faligndata(d1, d2);                                  \
  d2 = vis_faligndata(d2, d2);                                  \
  d4 = vis_bshuffle(d1, d2);                                    \
  d0 = vis_fpadd16(d0, d3);                                     \
  d1 = vis_fpadd16(d1, d4);                                     \
  d0 = vis_fpadd16(d0, d1);                                     \
  f0.f = vis_fpack16(d0)

#endif /* MLIB_VIS2 */
601 
602 /***************************************************************/
/*
 * LOAD_BC_U8_3CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_3): loads the
 * filters and the four source rows for one 3-channel pixel.
 *
 * - yFilter is an 8-byte entry of `mlib_filters_u8` at the byte offset
 *   derived from Y. The x-filter comes from the second table
 *   `mlib_filters_u8_3`, whose entries are three times as large (byte offset
 *   3*filterposx): three consecutive 8-byte words xFilter0..xFilter2.
 * - X and Y are then advanced by dX and dY.
 * - Four consecutive rows starting at sPtr (stepping by srcYStride) are
 *   read. Each row needs 16 bytes from sPtr on, so three aligned 8-byte
 *   words are loaded and combined pairwise with vis_faligndata() into
 *   row?0 (bytes 0..7) and row?1 (bytes 8..15).
 *
 * Clobbers dpSrc, data0..data2, xPtr, filterposx, filterposy; leaves sPtr on
 * the fourth row.
 */
#define LOAD_BC_U8_3CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_3)      \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                      \
  yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                      \
  xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx));      \
  xFilter0 = xPtr[0];                                                  \
  xFilter1 = xPtr[1];                                                  \
  xFilter2 = xPtr[2];                                                  \
  X += dX;                                                             \
  Y += dY;                                                             \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  data2 = dpSrc[2];                                                    \
  row00 = vis_faligndata(data0, data1);                                \
  row01 = vis_faligndata(data1, data2);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  data2 = dpSrc[2];                                                    \
  row10 = vis_faligndata(data0, data1);                                \
  row11 = vis_faligndata(data1, data2);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  data2 = dpSrc[2];                                                    \
  row20 = vis_faligndata(data0, data1);                                \
  row21 = vis_faligndata(data1, data2);                                \
  sPtr += srcYStride;                                                  \
  ALIGN_ADDR(dpSrc, sPtr);                                             \
  data0 = dpSrc[0];                                                    \
  data1 = dpSrc[1];                                                    \
  data2 = dpSrc[2];                                                    \
  row30 = vis_faligndata(data0, data1);                                \
  row31 = vis_faligndata(data1, data2)
640 
641 /***************************************************************/
/*
 * Write the three result bytes f0.t[0], f0.t[1], f0.t[2] to dstPixelPtr[0..2]
 * and advance dstPixelPtr by one 3-channel pixel.  f0.t[3] is never stored.
 *
 * The statements are wrapped in do { } while (0) so the macro expands to a
 * single statement: used under an unbraced if/else or loop, the whole store
 * (not just its first assignment) is controlled by the condition.
 */
#define STORE_BC_U8_3CH_1PIXEL()                                \
  do {                                                          \
    dstPixelPtr[0] = f0.t[0];                                   \
    dstPixelPtr[1] = f0.t[1];                                   \
    dstPixelPtr[2] = f0.t[2];                                   \
    dstPixelPtr += 3;                                           \
  } while (0)
647 
648 /***************************************************************/
/*
 * Point sPtr at the top-left sample of the source window for the current
 * (X, Y): the integer parts of X and Y (X >> MLIB_SHIFT, Y >> MLIB_SHIFT)
 * minus one give xSrc / ySrc, and sPtr becomes lineAddr[ySrc] + 3 * xSrc
 * (3 bytes per pixel).  X and Y themselves are not modified.
 *
 * Wrapped in do { } while (0) so the three assignments always behave as one
 * statement, e.g. under an unbraced if.
 */
#define NEXT_PIXEL_3BC_U8()                                     \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_u8 *)lineAddr[ySrc] + (3 * xSrc);              \
  } while (0)
653 
654 /***************************************************************/
/*
 * Filter the 3-channel pixel whose four source rows are already held in
 * row00/row01 .. row30/row31.  Nothing is loaded and X/Y are not touched.
 *
 *   - Vertical pass: rows 0 and 1 are multiplied against vis_read_hi(yFilter)
 *     (vis_fmul8x16au for row 0, vis_fmul8x16al for row 1), rows 2 and 3
 *     against vis_read_lo(yFilter) in the same way, and the four products are
 *     accumulated into sum0, sum1, sum2.
 *   - Horizontal pass: each sumN is multiplied by xFilterN using the
 *     vis_fmul8sux16 / vis_fmul8ulx16 pair, whose two partial products are
 *     added to give d0, d1, d2.
 *   - FADD_3BC_U8() (defined elsewhere in this file) is then invoked on
 *     d0..d2; callers store f0 afterwards, so it presumably leaves the packed
 *     pixel there -- confirm against its definition.
 *
 * The multiplies and adds are interleaved on purpose; the statement order is
 * kept as is.  The body is wrapped in do { } while (0) and no longer ends in
 * a semicolon, so `RESULT_3BC_U8_1PIXEL();` is exactly one statement and can
 * sit in an if/else without braces.
 */
#define RESULT_3BC_U8_1PIXEL()                                      \
  do {                                                              \
    v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
    v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
    v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
    v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
    v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
    v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
    v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
    sum0 = vis_fpadd16(v00, v10);                                   \
    v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
    sum1 = vis_fpadd16(v01, v11);                                   \
    v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
    sum2 = vis_fpadd16(v02, v12);                                   \
    v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
    sum0 = vis_fpadd16(sum0, v20);                                  \
    v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
    sum1 = vis_fpadd16(sum1, v21);                                  \
    v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
    sum2 = vis_fpadd16(sum2, v22);                                  \
    sum0 = vis_fpadd16(sum0, v30);                                  \
    sum1 = vis_fpadd16(sum1, v31);                                  \
    v00 = vis_fmul8sux16(sum0, xFilter0);                           \
    sum2 = vis_fpadd16(sum2, v32);                                  \
    v01 = vis_fmul8ulx16(sum0, xFilter0);                           \
    v10 = vis_fmul8sux16(sum1, xFilter1);                           \
    d0 = vis_fpadd16(v00, v01);                                     \
    v11 = vis_fmul8ulx16(sum1, xFilter1);                           \
    v20 = vis_fmul8sux16(sum2, xFilter2);                           \
    d1 = vis_fpadd16(v10, v11);                                     \
    v21 = vis_fmul8ulx16(sum2, xFilter2);                           \
    d2 = vis_fpadd16(v20, v21);                                     \
    FADD_3BC_U8();                                                  \
  } while (0)
687 
688 /***************************************************************/
/*
 * One step of the software-pipelined 3-channel loop.
 *
 * On entry:  row00..row31, yFilter and xFilter0..2 belong to pixel P;
 *            X, Y and sPtr already refer to pixel P+1.
 * On exit:   d0, d1, d2 hold the horizontal products for pixel P (the caller
 *            follows up with FADD_3BC_U8(), defined elsewhere in this file);
 *            row00..row31, yFilter and xFilter0..2 belong to pixel P+1;
 *            X, Y, xSrc, ySrc and sPtr refer to pixel P+2.
 *
 * Three things are interleaved statement by statement:
 *   - the vertical pass for P (vis_fmul8x16au/al of each row against the
 *     halves of yFilter, accumulated into sum0..sum2) followed by the
 *     horizontal products with xFilter0..2;
 *   - the reload of row00..row31 from four source lines starting at sPtr,
 *     srcYStride apart, each via ALIGN_ADDR + three mlib_d64 loads + two
 *     vis_faligndata calls;
 *   - the bookkeeping: filterposy/filterposx are taken from Y and X *before*
 *     they are advanced by dY/dX, masked with FILTER_MASK, and used as byte
 *     offsets into the two tables passed as arguments (the x offset is
 *     scaled by 3); xSrc/ySrc are taken from the advanced X and Y.
 *
 * Statement order is significant: every rowNN, yFilter and xFilterN is
 * overwritten only after its last use for pixel P.  Do not reorder.
 */
689 #define BC_U8_3CH(mlib_filters_u8, mlib_filters_u8_3)                 \
690   v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));     \
691   v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter));     \
692   v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter));     \
693   ALIGN_ADDR(dpSrc, sPtr);                                            \
694   data0 = dpSrc[0];                                                   \
695   filterposy = (Y >> FILTER_SHIFT);                                   \
696   v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));     \
697   data1 = dpSrc[1];                                                   \
698   v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter));     \
699   sum0 = vis_fpadd16(v00, v10);                                       \
700   data2 = dpSrc[2];                                                   \
701   row00 = vis_faligndata(data0, data1);                               \
702   v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter));     \
703   row01 = vis_faligndata(data1, data2);                               \
704   filterposx = (X >> FILTER_SHIFT);                                   \
705   sPtr += srcYStride;                                                 \
706   ALIGN_ADDR(dpSrc, sPtr);                                            \
707   v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));     \
708   sum1 = vis_fpadd16(v01, v11);                                       \
709   data0 = dpSrc[0];                                                   \
710   X += dX;                                                            \
711   data1 = dpSrc[1];                                                   \
712   v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter));     \
713   sum2 = vis_fpadd16(v02, v12);                                       \
714   data2 = dpSrc[2];                                                   \
715   row10 = vis_faligndata(data0, data1);                               \
716   v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter));     \
717   row11 = vis_faligndata(data1, data2);                               \
718   sPtr += srcYStride;                                                 \
719   ALIGN_ADDR(dpSrc, sPtr);                                            \
720   Y += dY;                                                            \
721   xSrc = (X>>MLIB_SHIFT)-1;                                           \
722   v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter));     \
723   sum0 = vis_fpadd16(sum0, v20);                                      \
724   data0 = dpSrc[0];                                                   \
725   ySrc = (Y>>MLIB_SHIFT)-1;                                           \
726   data1 = dpSrc[1];                                                   \
727   v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter));     \
728   sum1 = vis_fpadd16(sum1, v21);                                      \
729   data2 = dpSrc[2];                                                   \
730   filterposy &= FILTER_MASK;                                          \
731   row20 = vis_faligndata(data0, data1);                               \
732   v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter));     \
733   row21 = vis_faligndata(data1, data2);                               \
734   sPtr += srcYStride;                                                 \
735   filterposx &= FILTER_MASK;                                          \
736   sum2 = vis_fpadd16(sum2, v22);                                      \
737   ALIGN_ADDR(dpSrc, sPtr);                                            \
738   sum0 = vis_fpadd16(sum0, v30);                                      \
739   data0 = dpSrc[0];                                                   \
740   sum1 = vis_fpadd16(sum1, v31);                                      \
741   v00 = vis_fmul8sux16(sum0, xFilter0);                               \
742   data1 = dpSrc[1];                                                   \
743   sum2 = vis_fpadd16(sum2, v32);                                      \
744   v01 = vis_fmul8ulx16(sum0, xFilter0);                               \
745   data2 = dpSrc[2];                                                   \
746   row30 = vis_faligndata(data0, data1);                               \
747   v10 = vis_fmul8sux16(sum1, xFilter1);                               \
748   d0 = vis_fpadd16(v00, v01);                                         \
749   row31 = vis_faligndata(data1, data2);                               \
750   yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); \
751   v11 = vis_fmul8ulx16(sum1, xFilter1);                               \
752   xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx));     \
753   xFilter0 = xPtr[0];                                                 \
754   v20 = vis_fmul8sux16(sum2, xFilter2);                               \
755   d1 = vis_fpadd16(v10, v11);                                         \
756   xFilter1 = xPtr[1];                                                 \
757   v21 = vis_fmul8ulx16(sum2, xFilter2);                               \
758   xFilter2 = xPtr[2];                                                 \
759   sPtr = (mlib_u8 *)lineAddr[ySrc] + (3*xSrc);                        \
760   d2 = vis_fpadd16(v20, v21)
761 
762 /***************************************************************/
/*
 * Bicubic affine inner loop for 3-channel mlib_u8 images.
 *
 * For every destination row j in [yStart, yFinish] the function walks the
 * cols = xRight - xLeft + 1 pixels of the row, producing one 3-byte pixel at
 * a time through f0 and STORE_BC_U8_3CH_1PIXEL().
 *
 *   - Rows with at least 4 pixels go through a software pipeline: a prologue
 *     primes it, the main loop stores one finished pixel per iteration
 *     (cols - 4 iterations), and the epilogue drains the last 4 pixels.
 *   - Shorter rows (and nothing else, since the pipeline leaves i == cols)
 *     are handled by the simple per-pixel loop at the end.
 *
 * The coefficient tables are the *_bc pair when filter == MLIB_BICUBIC and
 * the *_bc2 pair otherwise.
 *
 * param: affine-transform state; it is only consumed through macros here.
 * Returns MLIB_SUCCESS (the only return statement in the function).
 *
 * NOTE(review): DECLAREVAR_BC(), CLIP() and FADD_3BC_U8() are defined outside
 * this section.  They presumably unpack param, set up xLeft/xRight/X/Y/
 * dstPixelPtr for the row, and reduce d0..d2 into f0 -- confirm against
 * their definitions.
 */
mlib_ImageAffine_u8_3ch_bc(mlib_affine_param * param)763 mlib_status mlib_ImageAffine_u8_3ch_bc (mlib_affine_param *param)
764 {
765   DECLAREVAR_BC();
766   mlib_s32  filterposx, filterposy;
767   mlib_d64  data0, data1, data2;
768   mlib_d64  sum0, sum1, sum2;
769   mlib_d64  row00, row10, row20, row30;
770   mlib_d64  row01, row11, row21, row31;
771   mlib_d64  xFilter0, xFilter1, xFilter2, yFilter;
772   mlib_d64  v00, v10, v20, v30;
773   mlib_d64  v01, v11, v21, v31;
774   mlib_d64  v02, v12, v22, v32;
775   mlib_d64  d0, d1, d2, d3, d4;
776   mlib_d64  *dpSrc;
777   mlib_s32  cols, i;
778   mlib_d64  *xPtr;
      /* f0.t[0..2] are the bytes written by STORE_BC_U8_3CH_1PIXEL() */
779   union {
780     mlib_u8 t[4];
781     mlib_f32 f;
782   } f0;
783   const mlib_s16 *mlib_filters_table  ;
784   const mlib_s16 *mlib_filters_table_3;
785 
      /* pick the vertical table and its 3-channel horizontal counterpart */
786   if (filter == MLIB_BICUBIC) {
787     mlib_filters_table   = mlib_filters_u8_bc;
788     mlib_filters_table_3 = mlib_filters_u8_bc_3;
789   } else {
790     mlib_filters_table   = mlib_filters_u8_bc2;
791     mlib_filters_table_3 = mlib_filters_u8_bc2_3;
792   }
793 
      /*
       * GSR is written once for the whole image (presumably selecting pack
       * scale factor 3 -- confirm).  MLIB_WRITE_BMASK expands to nothing
       * unless MLIB_VIS2 is defined.
       */
794   vis_write_gsr(3 << 3);
795   MLIB_WRITE_BMASK(0x6789ABCD);
796 
797   for (j = yStart; j <= yFinish; j ++) {
798 
799     CLIP(3);
800 
801     cols = xRight - xLeft + 1;
802     i = 0;
803 
        /* pipelined path: needs at least 4 pixels in the row */
804     if (i <= cols - 4) {
805 
          /* prologue: load pixel 0, then run two BC steps so that f0 holds
             pixel 0, d0..d2 hold pixel 1 and the rows hold pixel 2 */
806       NEXT_PIXEL_3BC_U8();
807       LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
808 
809       NEXT_PIXEL_3BC_U8();
810 
811       BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
812       FADD_3BC_U8();
813 
814       BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
815 
          /* steady state: store one pixel, finish the next, start another */
816 #pragma pipeloop(0)
817       for (; i < cols-4; i++) {
818         STORE_BC_U8_3CH_1PIXEL();
819 
820         FADD_3BC_U8();
821         BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
822       }
823 
          /* epilogue: drain the four pixels still in flight; the last one is
             loaded from the sPtr left behind by the final BC_U8_3CH step */
824       STORE_BC_U8_3CH_1PIXEL();
825 
826       FADD_3BC_U8();
827       STORE_BC_U8_3CH_1PIXEL();
828 
829       RESULT_3BC_U8_1PIXEL();
830       STORE_BC_U8_3CH_1PIXEL();
831 
832       LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
833       RESULT_3BC_U8_1PIXEL();
834       STORE_BC_U8_3CH_1PIXEL();
835       i += 4;
836     }
837 
        /* rows shorter than 4 pixels: plain load / filter / store per pixel */
838     for (; i < cols; i++) {
839       NEXT_PIXEL_3BC_U8();
840       LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
841       RESULT_3BC_U8_1PIXEL();
842       STORE_BC_U8_3CH_1PIXEL();
843     }
844   }
845 
846   return MLIB_SUCCESS;
847 }
848 
849 /***************************************************************/
/*
 * Combine the per-tap partial results of two 4-channel pixels and pack them
 * into res:
 *   d0 = d00 + d10 + d20 + d30   (pixel produced with ind == 0)
 *   d2 = d01 + d11 + d21 + d31   (pixel produced with ind == 1)
 *   res = vis_fpack16_pair(d0, d2)
 * d1 and d3 are used as scratch and are left holding d20 + d30 and
 * d21 + d31.
 *
 * Wrapped in do { } while (0) so the seven statements always act as a single
 * statement at the call site.
 */
#define FADD_4BC_U8()                                           \
  do {                                                          \
    d0 = vis_fpadd16(d00, d10);                                 \
    d1 = vis_fpadd16(d20, d30);                                 \
    d0 = vis_fpadd16(d0, d1);                                   \
    d2 = vis_fpadd16(d01, d11);                                 \
    d3 = vis_fpadd16(d21, d31);                                 \
    d2 = vis_fpadd16(d2, d3);                                   \
    res = vis_fpack16_pair(d0, d2);                             \
  } while (0)
858 
859 /***************************************************************/
/*
 * Gather everything RESULT_4BC_U8_1PIXEL() needs for one 4-channel pixel:
 *   - filterposx / filterposy: (X or Y) >> FILTER_SHIFT, masked with
 *     FILTER_MASK, used as byte offsets into the tables passed as arguments
 *     (the x offset is scaled by 4);
 *   - yFilter: one mlib_d64 from the first table, xFilter0..3: four
 *     consecutive mlib_d64 from the second;
 *   - row00/row01 .. row30/row31: for each of four source lines, starting at
 *     sPtr and srcYStride apart, three mlib_d64 are read at dpSrc and joined
 *     with two vis_faligndata calls.
 * X and Y are advanced by dX and dY; sPtr is left on the fourth source line.
 */
#define LOAD_BC_U8_4CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_4)          \
  /* table offsets for the current X and Y */                              \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                          \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                          \
  /* vertical coefficients, then the four horizontal ones */               \
  yFilter = *(mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy);        \
  xPtr = (mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4 + 4 * filterposx);      \
  xFilter0 = xPtr[0];                                                      \
  xFilter1 = xPtr[1];                                                      \
  xFilter2 = xPtr[2];                                                      \
  xFilter3 = xPtr[3];                                                      \
  /* move on to the next destination pixel */                              \
  X += dX;                                                                 \
  Y += dY;                                                                 \
  /* source line 0 */                                                      \
  ALIGN_ADDR(dpSrc, sPtr);                                                 \
  data0 = dpSrc[0];                                                        \
  data1 = dpSrc[1];                                                        \
  data2 = dpSrc[2];                                                        \
  row00 = vis_faligndata(data0, data1);                                    \
  row01 = vis_faligndata(data1, data2);                                    \
  /* source line 1 */                                                      \
  sPtr += srcYStride;                                                      \
  ALIGN_ADDR(dpSrc, sPtr);                                                 \
  data0 = dpSrc[0];                                                        \
  data1 = dpSrc[1];                                                        \
  data2 = dpSrc[2];                                                        \
  row10 = vis_faligndata(data0, data1);                                    \
  row11 = vis_faligndata(data1, data2);                                    \
  /* source line 2 */                                                      \
  sPtr += srcYStride;                                                      \
  ALIGN_ADDR(dpSrc, sPtr);                                                 \
  data0 = dpSrc[0];                                                        \
  data1 = dpSrc[1];                                                        \
  data2 = dpSrc[2];                                                        \
  row20 = vis_faligndata(data0, data1);                                    \
  row21 = vis_faligndata(data1, data2);                                    \
  /* source line 3 */                                                      \
  sPtr += srcYStride;                                                      \
  ALIGN_ADDR(dpSrc, sPtr);                                                 \
  data0 = dpSrc[0];                                                        \
  data1 = dpSrc[1];                                                        \
  data2 = dpSrc[2];                                                        \
  row30 = vis_faligndata(data0, data1);                                    \
  row31 = vis_faligndata(data1, data2)
898 
899 /***************************************************************/
/*
 * Point sPtr at the top-left sample of the source window for the current
 * (X, Y): xSrc / ySrc are the integer parts of X and Y (>> MLIB_SHIFT)
 * minus one, and sPtr becomes lineAddr[ySrc] + 4 * xSrc (4 bytes per pixel).
 * X and Y themselves are not modified.
 *
 * Wrapped in do { } while (0) so the three assignments always behave as one
 * statement, e.g. under an unbraced if.
 */
#define NEXT_PIXEL_4BC_U8()                                     \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_u8 *)lineAddr[ySrc] + (4 * xSrc);              \
  } while (0)
904 
905 /***************************************************************/
/*
 * Filter the 4-channel pixel whose four source rows are already held in
 * row00/row01 .. row30/row31.  Nothing is loaded and X/Y are not touched.
 *
 *   - Vertical pass: rows 0 and 1 are multiplied against vis_read_hi(yFilter)
 *     (vis_fmul8x16au for row 0, vis_fmul8x16al for row 1), rows 2 and 3
 *     against vis_read_lo(yFilter) in the same way; the products are
 *     accumulated into sum0..sum3 (hi/lo halves of rowN0, then of rowN1).
 *   - Horizontal pass: sumN is multiplied by xFilterN with the
 *     vis_fmul8sux16 / vis_fmul8ulx16 pair and the two partial products are
 *     added.
 *
 * ind selects the output set by token pasting: the four results are written
 * to d0<ind>, d1<ind>, d2<ind>, d3<ind> (e.g. d00, d10, d20, d30 for ind 0),
 * which is what FADD_4BC_U8() later sums.
 *
 * The multiply/add interleaving is kept exactly as written.  The body is
 * wrapped in do { } while (0) so the macro is a single statement at the call
 * site.
 */
#define RESULT_4BC_U8_1PIXEL(ind)                                   \
  do {                                                              \
    v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
    v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
    v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
    v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter)); \
    v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
    v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
    sum0 = vis_fpadd16(v00, v10);                                   \
    v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
    sum1 = vis_fpadd16(v01, v11);                                   \
    v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter)); \
    sum2 = vis_fpadd16(v02, v12);                                   \
    v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
    sum3 = vis_fpadd16(v03, v13);                                   \
    v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
    sum0 = vis_fpadd16(sum0, v20);                                  \
    v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
    sum1 = vis_fpadd16(sum1, v21);                                  \
    v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter)); \
    sum2 = vis_fpadd16(sum2, v22);                                  \
    v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
    sum3 = vis_fpadd16(sum3, v23);                                  \
    v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
    sum0 = vis_fpadd16(sum0, v30);                                  \
    v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
    sum1 = vis_fpadd16(sum1, v31);                                  \
    v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter)); \
    sum2 = vis_fpadd16(sum2, v32);                                  \
    v00 = vis_fmul8sux16(sum0, xFilter0);                           \
    sum3 = vis_fpadd16(sum3, v33);                                  \
    v01 = vis_fmul8ulx16(sum0, xFilter0);                           \
    v10 = vis_fmul8sux16(sum1, xFilter1);                           \
    d0##ind = vis_fpadd16(v00, v01);                                \
    v11 = vis_fmul8ulx16(sum1, xFilter1);                           \
    v20 = vis_fmul8sux16(sum2, xFilter2);                           \
    d1##ind = vis_fpadd16(v10, v11);                                \
    v21 = vis_fmul8ulx16(sum2, xFilter2);                           \
    v30 = vis_fmul8sux16(sum3, xFilter3);                           \
    d2##ind = vis_fpadd16(v20, v21);                                \
    v31 = vis_fmul8ulx16(sum3, xFilter3);                           \
    d3##ind = vis_fpadd16(v30, v31);                                \
  } while (0)
947 
948 /***************************************************************/
/*
 * One step of the software-pipelined 4-channel loop.
 *
 * On entry:  row00..row31, yFilter and xFilter0..3 belong to pixel P;
 *            X, Y and sPtr already refer to pixel P+1.
 * On exit:   d0<ind>, d1<ind>, d2<ind>, d3<ind> (token-pasted from ind) hold
 *            the horizontal products for pixel P, ready for FADD_4BC_U8();
 *            row00..row31, yFilter and xFilter0..3 belong to pixel P+1;
 *            X, Y, xSrc, ySrc and sPtr refer to pixel P+2.
 *
 * Three things are interleaved statement by statement:
 *   - the vertical pass for P (vis_fmul8x16au/al of each row half against the
 *     halves of yFilter, accumulated into sum0..sum3) followed by the
 *     horizontal products with xFilter0..3;
 *   - the reload of row00..row31 from four source lines starting at sPtr,
 *     srcYStride apart, each via ALIGN_ADDR + three mlib_d64 loads + two
 *     vis_faligndata calls;
 *   - the bookkeeping: filterposy/filterposx are taken from Y and X *before*
 *     they are advanced by dY/dX, masked with FILTER_MASK, and used as byte
 *     offsets into the two tables passed as arguments (the x offset is
 *     scaled by 4); xSrc/ySrc are taken from the advanced X and Y.
 *
 * Statement order is significant: every rowNN, yFilter and xFilterN is
 * overwritten only after its last use for pixel P.  Do not reorder.
 */
949 #define BC_U8_4CH(ind, mlib_filters_u8, mlib_filters_u8_4)            \
950   v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));     \
951   v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter));     \
952   v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter));     \
953   v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter));     \
954   ALIGN_ADDR(dpSrc, sPtr);                                            \
955   data0 = dpSrc[0];                                                   \
956   filterposy = (Y >> FILTER_SHIFT);                                   \
957   v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));     \
958   data1 = dpSrc[1];                                                   \
959   v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter));     \
960   sum0 = vis_fpadd16(v00, v10);                                       \
961   data2 = dpSrc[2];                                                   \
962   row00 = vis_faligndata(data0, data1);                               \
963   v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter));     \
964   row01 = vis_faligndata(data1, data2);                               \
965   filterposx = (X >> FILTER_SHIFT);                                   \
966   v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter));     \
967   sPtr += srcYStride;                                                 \
968   ALIGN_ADDR(dpSrc, sPtr);                                            \
969   v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));     \
970   sum1 = vis_fpadd16(v01, v11);                                       \
971   data0 = dpSrc[0];                                                   \
972   X += dX;                                                            \
973   data1 = dpSrc[1];                                                   \
974   v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter));     \
975   sum2 = vis_fpadd16(v02, v12);                                       \
976   data2 = dpSrc[2];                                                   \
977   row10 = vis_faligndata(data0, data1);                               \
978   v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter));     \
979   row11 = vis_faligndata(data1, data2);                               \
980   sPtr += srcYStride;                                                 \
981   ALIGN_ADDR(dpSrc, sPtr);                                            \
982   v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter));     \
983   sum3 = vis_fpadd16(v03, v13);                                       \
984   Y += dY;                                                            \
985   xSrc = (X>>MLIB_SHIFT)-1;                                           \
986   v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter));     \
987   sum0 = vis_fpadd16(sum0, v20);                                      \
988   data0 = dpSrc[0];                                                   \
989   ySrc = (Y>>MLIB_SHIFT)-1;                                           \
990   data1 = dpSrc[1];                                                   \
991   v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter));     \
992   sum1 = vis_fpadd16(sum1, v21);                                      \
993   data2 = dpSrc[2];                                                   \
994   filterposy &= FILTER_MASK;                                          \
995   row20 = vis_faligndata(data0, data1);                               \
996   v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter));     \
997   row21 = vis_faligndata(data1, data2);                               \
998   sPtr += srcYStride;                                                 \
999   filterposx &= FILTER_MASK;                                          \
1000   v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter));     \
1001   sum2 = vis_fpadd16(sum2, v22);                                      \
1002   ALIGN_ADDR(dpSrc, sPtr);                                            \
1003   sum3 = vis_fpadd16(sum3, v23);                                      \
1004   sum0 = vis_fpadd16(sum0, v30);                                      \
1005   data0 = dpSrc[0];                                                   \
1006   sum1 = vis_fpadd16(sum1, v31);                                      \
1007   v00 = vis_fmul8sux16(sum0, xFilter0);                               \
1008   data1 = dpSrc[1];                                                   \
1009   sum2 = vis_fpadd16(sum2, v32);                                      \
1010   v01 = vis_fmul8ulx16(sum0, xFilter0);                               \
1011   sum3 = vis_fpadd16(sum3, v33);                                      \
1012   data2 = dpSrc[2];                                                   \
1013   row30 = vis_faligndata(data0, data1);                               \
1014   v10 = vis_fmul8sux16(sum1, xFilter1);                               \
1015   d0##ind = vis_fpadd16(v00, v01);                                    \
1016   row31 = vis_faligndata(data1, data2);                               \
1017   yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); \
1018   v11 = vis_fmul8ulx16(sum1, xFilter1);                               \
1019   xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4+4*filterposx));     \
1020   xFilter0 = xPtr[0];                                                 \
1021   v20 = vis_fmul8sux16(sum2, xFilter2);                               \
1022   d1##ind = vis_fpadd16(v10, v11);                                    \
1023   xFilter1 = xPtr[1];                                                 \
1024   v21 = vis_fmul8ulx16(sum2, xFilter2);                               \
1025   xFilter2 = xPtr[2];                                                 \
1026   v30 = vis_fmul8sux16(sum3, xFilter3);                               \
1027   d2##ind = vis_fpadd16(v20, v21);                                    \
1028   v31 = vis_fmul8ulx16(sum3, xFilter3);                               \
1029   xFilter3 = xPtr[3];                                                 \
1030   sPtr = (mlib_u8 *)lineAddr[ySrc] + (4*xSrc);                        \
1031   d3##ind = vis_fpadd16(v30, v31)
1032 
1033 /***************************************************************/
/*
 * Bicubic affine inner loop for 4-channel mlib_u8 images.
 *
 * For every destination row j in [yStart, yFinish] the cols =
 * xRight - xLeft + 1 pixels are produced two at a time: FADD_4BC_U8() packs
 * a pixel pair into the 8-byte value res, which is then written with two
 * partial stores (vis_pst_8 with mask at dp, then with ~mask at the next
 * mlib_d64) after being passed through vis_faligndata(res, res) with the
 * alignment taken from gsrd.
 *
 * The row is consumed in stages, each advancing i:
 *   - cols - i >= 6: software pipeline (prologue, loop storing one pair per
 *     iteration, epilogue draining three pairs);
 *   - cols - i >= 4: two pairs, partly pipelined;
 *   - cols - i >= 2: one pair, computed pixel by pixel;
 *   - one pixel left: computed alone and stored under edge masks so nothing
 *     past dstLineEnd is written.
 *
 * The coefficient tables are the *_bc pair when filter == MLIB_BICUBIC and
 * the *_bc2 pair otherwise.
 *
 * param: affine-transform state; it is only consumed through macros here.
 * Returns MLIB_SUCCESS (the only return statement in the function).
 *
 * NOTE(review): DECLAREVAR_BC() and CLIP() are defined outside this section;
 * they presumably unpack param and set up xLeft/xRight/X/Y/dstPixelPtr for
 * the row -- confirm against their definitions.
 * NOTE(review): gsrd is an mlib_s32 that is cast straight to (void *) for
 * vis_alignaddr(); the value is limited to 0..7 by the "& 7", but the
 * int-to-pointer cast changes width on 64-bit targets and may draw a
 * compiler warning.
 */
mlib_ImageAffine_u8_4ch_bc(mlib_affine_param * param)1034 mlib_status mlib_ImageAffine_u8_4ch_bc (mlib_affine_param *param)
1035 {
1036   DECLAREVAR_BC();
1037   DTYPE  *dstLineEnd;
1038   mlib_s32  filterposx, filterposy;
1039   mlib_d64  data0, data1, data2;
1040   mlib_d64  sum0, sum1, sum2, sum3;
1041   mlib_d64  row00, row10, row20, row30;
1042   mlib_d64  row01, row11, row21, row31;
1043   mlib_d64  xFilter0, xFilter1, xFilter2, xFilter3, yFilter;
1044   mlib_d64  v00, v10, v20, v30;
1045   mlib_d64  v01, v11, v21, v31;
1046   mlib_d64  v02, v12, v22, v32;
1047   mlib_d64  v03, v13, v23, v33;
1048   mlib_d64  d0, d1, d2, d3;
1049   mlib_d64  d00, d10, d20, d30;
1050   mlib_d64  d01, d11, d21, d31;
1051   mlib_d64  *dpSrc;
1052   mlib_s32  cols, i;
1053   mlib_d64  res, *dp, *xPtr;
1054   mlib_s32  mask, emask, gsrd;
1055   const mlib_s16 *mlib_filters_table  ;
1056   const mlib_s16 *mlib_filters_table_4;
1057 
       /* pick the vertical table and its 4-channel horizontal counterpart */
1058   if (filter == MLIB_BICUBIC) {
1059     mlib_filters_table   = mlib_filters_u8_bc;
1060     mlib_filters_table_4 = mlib_filters_u8_bc_4;
1061   } else {
1062     mlib_filters_table   = mlib_filters_u8_bc2;
1063     mlib_filters_table_4 = mlib_filters_u8_bc2_4;
1064   }
1065 
1066   for (j = yStart; j <= yFinish; j++) {
1067 
         /* GSR is rewritten at the start of every row (unlike the 3-channel
            variant, which writes it once); presumably needed because the
            vis_alignaddr() calls below modify it -- confirm */
1068     vis_write_gsr(3 << 3);
1069 
1070     CLIP(4);
         /* dstLineEnd = last byte of the last pixel (xRight) of this row */
1071     dstLineEnd  = (DTYPE*)dstData + 4 * xRight;
1072     dstLineEnd += 3;
         /* dp: destination as mlib_d64 words; mask: edge mask for the first
            word; gsrd: (8 - address) & 7, the shift used before each store */
1073     dp = (mlib_d64*)vis_alignaddr(dstPixelPtr, 0);
1074     mask = vis_edge8(dstPixelPtr, dstLineEnd);
1075     gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1076 
1077     cols = xRight - xLeft + 1;
1078     i = 0;
1079 
         /* stage 1: at least 6 pixels -> fully pipelined */
1080     if (i <= cols - 6) {
1081 
           /* prologue: after these steps res holds pixels 0-1, the d?0/d?1
              sets hold pixels 2-3 and the rows hold pixel 4 */
1082       NEXT_PIXEL_4BC_U8();
1083       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1084 
1085       NEXT_PIXEL_4BC_U8();
1086 
1087       BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1088       BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1089       FADD_4BC_U8();
1090 
1091       BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1092       BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1093 
           /* steady state: store one pair, pack the next, start another */
1094 #pragma pipeloop(0)
1095       for (; i <= cols-8; i+=2) {
1096         vis_alignaddr((void *)gsrd, 0);
1097         res = vis_faligndata(res, res);
1098 
1099         vis_pst_8(res, dp++, mask);
1100         vis_pst_8(res, dp, ~mask);
1101 
1102         FADD_4BC_U8();
1103         BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1104         BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1105       }
1106 
           /* epilogue: drain the three pairs still in flight */
1107       vis_alignaddr((void *)gsrd, 0);
1108       res = vis_faligndata(res, res);
1109 
1110       vis_pst_8(res, dp++, mask);
1111       vis_pst_8(res, dp, ~mask);
1112 
1113       FADD_4BC_U8();
1114       vis_alignaddr((void *)gsrd, 0);
1115       res = vis_faligndata(res, res);
1116 
1117       vis_pst_8(res, dp++, mask);
1118       vis_pst_8(res, dp, ~mask);
1119 
           /* last pair: first pixel from the rows already loaded, second
              one loaded from the sPtr left by the final BC_U8_4CH step */
1120       RESULT_4BC_U8_1PIXEL(0);
1121       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1122       RESULT_4BC_U8_1PIXEL(1);
1123       FADD_4BC_U8();
1124 
1125       vis_alignaddr((void *)gsrd, 0);
1126       res = vis_faligndata(res, res);
1127 
1128       vis_pst_8(res, dp++, mask);
1129       vis_pst_8(res, dp, ~mask);
1130       i += 6;
1131     }
1132 
         /* stage 2: at least 4 pixels left -> two pairs */
1133     if (i <= cols-4) {
1134       NEXT_PIXEL_4BC_U8();
1135       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1136 
1137       NEXT_PIXEL_4BC_U8();
1138 
1139       BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1140       BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1141       FADD_4BC_U8();
1142       vis_alignaddr((void *)gsrd, 0);
1143       res = vis_faligndata(res, res);
1144 
1145       vis_pst_8(res, dp++, mask);
1146       vis_pst_8(res, dp, ~mask);
1147 
1148       RESULT_4BC_U8_1PIXEL(0);
1149       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1150       RESULT_4BC_U8_1PIXEL(1);
1151       FADD_4BC_U8();
1152 
1153       vis_alignaddr((void *)gsrd, 0);
1154       res = vis_faligndata(res, res);
1155 
1156       vis_pst_8(res, dp++, mask);
1157       vis_pst_8(res, dp, ~mask);
1158       i += 4;
1159     }
1160 
         /* stage 3: at least 2 pixels left -> one pair, no pipelining */
1161     if (i <= cols-2) {
1162       NEXT_PIXEL_4BC_U8();
1163       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1164       RESULT_4BC_U8_1PIXEL(0);
1165 
1166       NEXT_PIXEL_4BC_U8();
1167       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1168       RESULT_4BC_U8_1PIXEL(1);
1169       FADD_4BC_U8();
1170 
1171       vis_alignaddr((void *)gsrd, 0);
1172       res = vis_faligndata(res, res);
1173 
1174       vis_pst_8(res, dp++, mask);
1175       vis_pst_8(res, dp, ~mask);
1176       i += 2;
1177     }
1178 
         /* stage 4: a single trailing pixel */
1179     if (i < cols) {
1180       NEXT_PIXEL_4BC_U8();
1181       LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1182       RESULT_4BC_U8_1PIXEL(0);
1183 
           /* same reduction as the first half of FADD_4BC_U8(); the pixel is
              packed twice (d0, d0) since there is no partner pixel */
1184       d0 = vis_fpadd16(d00, d10);
1185       d1 = vis_fpadd16(d20, d30);
1186       d0 = vis_fpadd16(d0, d1);
1187       res = vis_fpack16_pair(d0, d0);
1188       vis_alignaddr((void *)gsrd, 0);
1189       res = vis_faligndata(res, res);
1190 
           /* first word: limited by both the start mask and the end of the
              row; second word only if it still starts at or before
              dstLineEnd, with a fresh edge mask */
1191       emask = vis_edge8(dp, dstLineEnd);
1192       vis_pst_8(res, dp++, emask & mask);
1193 
1194       if ((mlib_u8*)dp <= (mlib_u8*)dstLineEnd) {
1195         mask = vis_edge8(dp, dstLineEnd);
1196         vis_pst_8(res, dp, mask);
1197       }
1198     }
1199   }
1200 
1201   return MLIB_SUCCESS;
1202 }
1203 
1204 /***************************************************************/
1205