1 /*
2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 
28 /*
29  *      The functions step along the lines from xLeft to xRight and apply
30  *      the bicubic filtering.
31  *
32  */
33 
34 #include "vis_proto.h"
35 #include "mlib_ImageAffine.h"
36 #include "mlib_v_ImageFilters.h"
37 
38 /***************************************************************/
/* Sample type processed by this file: 16-bit mlib_s16 elements. */
39 #define DTYPE  mlib_s16
40 
/*
 * Filter-table resolution setting.  Nothing in this section references
 * FILTER_BITS directly; presumably the included headers derive
 * FILTER_SHIFT and FILTER_MASK from it -- TODO confirm.
 */
41 #define FILTER_BITS  9
42 
43 /***************************************************************/
/* Short alias: the macros in this file write sPtr where they mean srcPixelPtr. */
44 #define sPtr srcPixelPtr
45 
46 /***************************************************************/
/*
 * Point sPtr at the source window for the current position (1 channel).
 *
 * Takes the integer part of the fixed-point coordinates X and Y
 * (X >> MLIB_SHIFT, Y >> MLIB_SHIFT), steps one sample left and one row up,
 * and sets sPtr to that mlib_s16 sample inside lineAddr[ySrc].
 *
 * Writes xSrc, ySrc and sPtr in the caller's scope; X and Y are not
 * advanced.  The expansion has no trailing semicolon -- the caller adds it.
 */
47 #define NEXT_PIXEL_1BC_S16()                                    \
48   xSrc = (X >> MLIB_SHIFT)-1;                                   \
49   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
50   sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
51 
52 /***************************************************************/
/*
 * Load everything needed to filter one 1-channel pixel, then advance X/Y.
 *
 * Parameters:
 *   mlib_filters_s16   - table from which the horizontal coefficients are
 *                        read as one mlib_d64 at byte offset filterposx.
 *   mlib_filters_s16_4 - table from which the vertical coefficients are
 *                        read as four consecutive mlib_d64 values starting
 *                        at byte offset filterposy * 4.
 *
 * Steps:
 *   1. For four consecutive source rows (sPtr is advanced by srcYStride
 *      elements between rows) read the two 8-byte-aligned words that
 *      contain sPtr (dpSrc[0], dpSrc[1]) and combine them with
 *      vis_alignaddr / vis_faligndata -- the VIS unaligned-load idiom --
 *      into row0..row3.  Each row is one mlib_d64, i.e. four 16-bit samples.
 *   2. filterposy / filterposx are taken from Y and X with FILTER_SHIFT and
 *      FILTER_MASK (both defined outside this section) and used to fetch
 *      yFilter0..yFilter3 and xFilter.
 *   3. X += dX, Y += dY.
 *
 * Side effects in the caller's scope: dpSrc, data0, data1, row0..row3,
 * filterposx, filterposy, yPtr, yFilter0..3, xFilter, X, Y are overwritten,
 * and sPtr is left three strides below where it started (on the last row).
 * Both aligned words are always read, even when sPtr is already aligned.
 */
53 #define LOAD_BC_S16_1CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)    \
54   vis_alignaddr(sPtr, 0);                                               \
55   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
56   data0 = dpSrc[0];                                                     \
57   data1 = dpSrc[1];                                                     \
58   row0 = vis_faligndata(data0, data1);                                  \
59   sPtr += srcYStride;                                                   \
60   vis_alignaddr(sPtr, 0);                                               \
61   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
62   data0 = dpSrc[0];                                                     \
63   data1 = dpSrc[1];                                                     \
64   row1 = vis_faligndata(data0, data1);                                  \
65   sPtr += srcYStride;                                                   \
66   vis_alignaddr(sPtr, 0);                                               \
67   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
68   data0 = dpSrc[0];                                                     \
69   data1 = dpSrc[1];                                                     \
70   row2 = vis_faligndata(data0, data1);                                  \
71   sPtr += srcYStride;                                                   \
72   vis_alignaddr(sPtr, 0);                                               \
73   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
74   data0 = dpSrc[0];                                                     \
75   data1 = dpSrc[1];                                                     \
76   row3 = vis_faligndata(data0, data1);                                  \
77   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
78   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
79   yFilter0 = yPtr[0];                                                   \
80   yFilter1 = yPtr[1];                                                   \
81   yFilter2 = yPtr[2];                                                   \
82   yFilter3 = yPtr[3];                                                   \
83   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
84   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
85   X += dX;                                                              \
86   Y += dY
87 
88 /***************************************************************/
/*
 * Compute one 1-channel output sample from the already loaded
 * row0..row3, yFilter0..yFilter3 and xFilter, leaving it in res.
 *
 *   - Each rowN is multiplied lane-wise by yFilterN using the paired
 *     vis_fmul8sux16 / vis_fmul8ulx16 products added with vis_fpadd16
 *     (the VIS idiom for a 16x16-bit multiply); the four products are
 *     accumulated into sum (vertical pass).
 *   - sum is multiplied the same way by xFilter (horizontal pass), giving
 *     four 16-bit lane products in d0.
 *   - The lanes are folded: high and low halves are added (vis_fpadd16s),
 *     the two remaining 16-bit values are widened with
 *     vis_fmuld8sux16(f_x01000100, ...), and the two 32-bit values are
 *     added (vis_fpadd32s).
 *   - The total is written into the low half of d1 and packed with
 *     vis_fpackfix_pair(d1, d1); callers store it with vis_st_u16.
 *
 * Scratch variables overwritten: u0..u3, v0..v3, sum, d00, d10, d0, d1, p0.
 * Only the low half of d1 is assigned here; its high half keeps whatever
 * value it had before.
 */
89 #define RESULT_1BC_S16_1PIXEL()                                          \
90   u0 = vis_fmul8sux16(row0, yFilter0);                                   \
91   u1 = vis_fmul8ulx16(row0, yFilter0);                                   \
92   u2 = vis_fmul8sux16(row1, yFilter1);                                   \
93   v0 = vis_fpadd16(u0, u1);                                              \
94   u3 = vis_fmul8ulx16(row1, yFilter1);                                   \
95   u0 = vis_fmul8sux16(row2, yFilter2);                                   \
96   v1 = vis_fpadd16(u2, u3);                                              \
97   u1 = vis_fmul8ulx16(row2, yFilter2);                                   \
98   sum = vis_fpadd16(v0, v1);                                             \
99   u2 = vis_fmul8sux16(row3, yFilter3);                                   \
100   v2 = vis_fpadd16(u0, u1);                                              \
101   u3 = vis_fmul8ulx16(row3, yFilter3);                                   \
102   sum = vis_fpadd16(sum, v2);                                            \
103   v3 = vis_fpadd16(u2, u3);                                              \
104   sum = vis_fpadd16(sum, v3);                                            \
105   d00 = vis_fmul8sux16(sum, xFilter);                                    \
106   d10 = vis_fmul8ulx16(sum, xFilter);                                    \
107   d0 = vis_fpadd16(d00, d10);                                            \
108   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));                   \
109   d0 = vis_fmuld8sux16(f_x01000100, p0);                                 \
110   d1 = vis_write_lo(d1, vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0))); \
111   res = vis_fpackfix_pair(d1, d1)
112 
113 /***************************************************************/
/*
 * One step of the software-pipelined 1-channel loop.  A single expansion
 * works on three pixels at different stages:
 *
 *   - Pixel k (rows, yFilter0..3 and xFilter already loaded): the rows are
 *     multiplied by the vertical coefficients and summed, the sum is
 *     multiplied by xFilter, and the four un-reduced 16-bit lane products
 *     are stored in d0<ind> (token pasting: ind = 0..3 selects d00..d03).
 *     FADD_1BC_S16 folds those lanes later.
 *   - Pixel k+1: row0..row3 are reloaded from the four rows starting at the
 *     incoming sPtr, and yFilter0..3 / xFilter are reloaded from filter
 *     positions sampled from X and Y before they are incremented.
 *   - Pixel k+2: after X += dX and Y += dY, xSrc / ySrc are recomputed and
 *     sPtr is left pointing at that pixel's source window.
 *
 * Parameters:
 *   ind                - digit pasted onto d0 to pick the result variable.
 *   mlib_filters_s16   - horizontal coefficient table (byte offset filterposx).
 *   mlib_filters_s16_4 - vertical coefficient table (byte offset filterposy*4).
 *
 * The statements are interleaved (presumably for instruction scheduling):
 * every rowN, yFilterN and xFilter is overwritten only after its last use
 * for pixel k, so the statement order must be preserved when editing.
 * d0 and d1 are used as scratch, along with u0..u3, v0..v3, sum, data0,
 * data1 and dpSrc.
 */
114 #define BC_S16_1CH(ind, mlib_filters_s16, mlib_filters_s16_4)           \
115   u0 = vis_fmul8sux16(row0, yFilter0);                                  \
116   u1 = vis_fmul8ulx16(row0, yFilter0);                                  \
117   vis_alignaddr(sPtr, 0);                                               \
118   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
119   u2 = vis_fmul8sux16(row1, yFilter1);                                  \
120   v0 = vis_fpadd16(u0, u1);                                             \
121   data0 = dpSrc[0];                                                     \
122   filterposy = (Y >> FILTER_SHIFT);                                     \
123   u3 = vis_fmul8ulx16(row1, yFilter1);                                  \
124   data1 = dpSrc[1];                                                     \
125   row0 = vis_faligndata(data0, data1);                                  \
126   filterposx = (X >> FILTER_SHIFT);                                     \
127   sPtr += srcYStride;                                                   \
128   vis_alignaddr(sPtr, 0);                                               \
129   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
130   u0 = vis_fmul8sux16(row2, yFilter2);                                  \
131   v1 = vis_fpadd16(u2, u3);                                             \
132   data0 = dpSrc[0];                                                     \
133   u1 = vis_fmul8ulx16(row2, yFilter2);                                  \
134   sum = vis_fpadd16(v0, v1);                                            \
135   X += dX;                                                              \
136   data1 = dpSrc[1];                                                     \
137   row1 = vis_faligndata(data0, data1);                                  \
138   sPtr += srcYStride;                                                   \
139   vis_alignaddr(sPtr, 0);                                               \
140   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
141   u2 = vis_fmul8sux16(row3, yFilter3);                                  \
142   v2 = vis_fpadd16(u0, u1);                                             \
143   Y += dY;                                                              \
144   xSrc = (X >> MLIB_SHIFT)-1;                                           \
145   data0 = dpSrc[0];                                                     \
146   u3 = vis_fmul8ulx16(row3, yFilter3);                                  \
147   sum = vis_fpadd16(sum, v2);                                           \
148   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
149   data1 = dpSrc[1];                                                     \
150   filterposy &= FILTER_MASK;                                            \
151   row2 = vis_faligndata(data0, data1);                                  \
152   sPtr += srcYStride;                                                   \
153   filterposx &= FILTER_MASK;                                            \
154   vis_alignaddr(sPtr, 0);                                               \
155   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
156   data0 = dpSrc[0];                                                     \
157   v3 = vis_fpadd16(u2, u3);                                             \
158   data1 = dpSrc[1];                                                     \
159   row3 = vis_faligndata(data0, data1);                                  \
160   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
161   yFilter0 = yPtr[0];                                                   \
162   sum = vis_fpadd16(sum, v3);                                           \
163   yFilter1 = yPtr[1];                                                   \
164   d0 = vis_fmul8sux16(sum, xFilter);                                    \
165   yFilter2 = yPtr[2];                                                   \
166   d1 = vis_fmul8ulx16(sum, xFilter);                                    \
167   yFilter3 = yPtr[3];                                                   \
168   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
169   d0##ind = vis_fpadd16(d0, d1);                                        \
170   sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
171 
172 /***************************************************************/
/*
 * Fold the lane products of four pixels (d00, d01, d02, d03, as produced
 * by BC_S16_1CH with ind = 0..3) into four output samples packed in res.
 *
 * For each pixel the high and low halves are added (vis_fpadd16s) into
 * p0..p3, widened to two 32-bit values with
 * vis_fmuld8sux16(f_x01000100, ...), and those two values are added
 * (vis_fpadd32s).  The four totals are paired into d0 (pixels 0,1) and
 * d1 (pixels 2,3) and packed by vis_fpackfix_pair(d0, d1).
 *
 * d0..d3 and p0..p3 are overwritten; d00..d03 are only read.
 */
173 #define FADD_1BC_S16()                                                \
174   p0 = vis_fpadd16s(vis_read_hi(d00), vis_read_lo(d00));              \
175   p1 = vis_fpadd16s(vis_read_hi(d01), vis_read_lo(d01));              \
176   p2 = vis_fpadd16s(vis_read_hi(d02), vis_read_lo(d02));              \
177   p3 = vis_fpadd16s(vis_read_hi(d03), vis_read_lo(d03));              \
178   d0 = vis_fmuld8sux16(f_x01000100, p0);                              \
179   d1 = vis_fmuld8sux16(f_x01000100, p1);                              \
180   d2 = vis_fmuld8sux16(f_x01000100, p2);                              \
181   d3 = vis_fmuld8sux16(f_x01000100, p3);                              \
182   d0 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0)),  \
183                      vis_fpadd32s(vis_read_hi(d1), vis_read_lo(d1))); \
184   d1 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d2), vis_read_lo(d2)),  \
185                      vis_fpadd32s(vis_read_hi(d3), vis_read_lo(d3))); \
186   res = vis_fpackfix_pair(d0, d1)
187 
188 /***************************************************************/
/*
 * Bicubic affine resampling of a single-channel mlib_s16 image using VIS.
 *
 * For every destination row j in [yStart, yFinish] the routine walks the
 * pixels from xLeft to xRight, filters the source window around each
 * fixed-point position (X, Y) with the selected coefficient tables, and
 * stores one 16-bit sample per pixel through dstPixelPtr.
 *
 * param  - affine parameter block.  It is not dereferenced directly in this
 *          body; the working variables (filter, j, yStart, yFinish, xLeft,
 *          xRight, X, Y, dX, dY, lineAddr, srcYStride, dstPixelPtr, ...) are
 *          presumably declared and filled from it by DECLAREVAR_BC() and
 *          CLIP(), which are defined outside this section -- TODO confirm.
 *
 * Returns MLIB_SUCCESS unconditionally.
 *
 * NOTE(review): the signature line below contains what looks like
 * listing-export residue (a duplicated symbol name and a listing number
 * ahead of the real declaration); compare with the upstream file.
 */
mlib_ImageAffine_s16_1ch_bc(mlib_affine_param * param)189 mlib_status mlib_ImageAffine_s16_1ch_bc (mlib_affine_param *param)
190 {
191   DECLAREVAR_BC();
192   mlib_s32  filterposx, filterposy;
193   mlib_d64  data0, data1;
194   mlib_d64  sum;
195   mlib_d64  row0, row1, row2, row3;
196   mlib_f32  p0, p1, p2, p3;
197   mlib_d64  xFilter, yFilter0, yFilter1, yFilter2, yFilter3;
198   mlib_d64  v0, v1, v2, v3;
199   mlib_d64  u0, u1, u2, u3;
200   mlib_d64  d0, d1, d2, d3;
201   mlib_d64  d00, d10, d01, d02, d03;
202   mlib_d64 *yPtr;
203   mlib_d64 *dpSrc;
204   mlib_s32  align, cols, i;
205   mlib_d64  res;
  /* Constant operand for vis_fmuld8sux16 when widening 16-bit partial sums. */
206   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
207   const mlib_s16 *mlib_filters_table  ;
208   const mlib_s16 *mlib_filters_table_4;
209 
  /* MLIB_BICUBIC uses the "bc" tables; any other filter value uses "bc2". */
210   if (filter == MLIB_BICUBIC) {
211     mlib_filters_table   = mlib_filters_s16_bc;
212     mlib_filters_table_4 = mlib_filters_s16_bc_4;
213   } else {
214     mlib_filters_table   = mlib_filters_s16_bc2;
215     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
216   }
217 
  /*
   * Halve the stride so it can be added to an mlib_s16 pointer
   * (presumably it arrives in bytes -- confirm against DECLAREVAR_BC).
   */
218   srcYStride >>= 1;
219 
220   for (j = yStart; j <= yFinish; j++) {
221 
    /*
     * Reprogram the GSR for every row with the value 10 << 3, i.e. 10 in
     * the field above the low three bits (presumably the vis_fpackfix_pair
     * scale factor -- confirm against the VIS manual).
     */
222     vis_write_gsr(10 << 3);
223 
    /* Row setup; CLIP is defined outside this section. */
224     CLIP(1);
225 
226     cols = xRight - xLeft + 1;
    /*
     * Number of leading pixels to emit one at a time so that dstPixelPtr
     * becomes 8-byte aligned: bytes to the next boundary, converted to
     * 16-bit samples, and never more than the row length.
     */
227     align = (8 - ((mlib_addr)dstPixelPtr) & 7) & 7;
228     align >>= 1;
229     align = (cols < align)? cols : align;
230 
    /* Unaligned head: one pixel per iteration, 16-bit partial stores. */
231     for (i = 0; i < align; i++) {
232       NEXT_PIXEL_1BC_S16();
233       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
234       RESULT_1BC_S16_1PIXEL();
235       vis_st_u16(res, dstPixelPtr++);
236     }
237 
    /*
     * Pipelined body, taken when at least 10 pixels remain.  The prologue
     * and epilogue together produce exactly 10 pixels (two packed groups
     * of four plus two single pixels); each loop iteration adds four more.
     * dstPixelPtr is 8-byte aligned here, so whole mlib_d64 stores are used.
     */
238     if (i <= cols - 10) {
239 
      /* Prologue: load the first pixel, then aim sPtr at the second. */
240       NEXT_PIXEL_1BC_S16();
241       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
242 
243       NEXT_PIXEL_1BC_S16();
244 
      /* Lane products for the first four pixels -> d00..d03. */
245       BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
246       BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
247       BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
248       BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
249 
      /* res = first four packed samples (stored later). */
250       FADD_1BC_S16();
251 
      /* Lane products for the next four pixels -> d00..d03. */
252       BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
253       BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
254       BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
255       BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
256 
      /*
       * Steady state: store the group finished one step earlier, fold the
       * pending group into res, and start the following group.
       */
257 #pragma pipeloop(0)
258       for (; i <= cols - 14; i += 4) {
259         *(mlib_d64*)dstPixelPtr = res;
260         FADD_1BC_S16();
261         BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
262         BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
263         BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
264         BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
265         dstPixelPtr += 4;
266       }
267 
      /* Epilogue: drain the two groups still in flight. */
268       *(mlib_d64*)dstPixelPtr = res;
269       dstPixelPtr += 4;
270       FADD_1BC_S16();
271       *(mlib_d64*)dstPixelPtr = res;
272       dstPixelPtr += 4;
273 
      /* The last BC_S16_1CH already loaded one more pixel: finish it. */
274       RESULT_1BC_S16_1PIXEL();
275       vis_st_u16(res, dstPixelPtr++);
276 
      /* sPtr already points at the following pixel: load and finish it. */
277       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
278       RESULT_1BC_S16_1PIXEL();
279       vis_st_u16(res, dstPixelPtr++);
      /* Account for the 10 pixels of prologue + epilogue. */
280       i += 10;
281     }
282 
    /* Tail: remaining pixels, one at a time. */
283     for (; i < cols; i++) {
284       NEXT_PIXEL_1BC_S16();
285       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
286       RESULT_1BC_S16_1PIXEL();
287       vis_st_u16(res, dstPixelPtr++);
288     }
289   }
290 
291   return MLIB_SUCCESS;
292 }
293 
294 /***************************************************************/
/*
 * Point sPtr at the source window for the current position (2 channels).
 *
 * Same as the 1-channel variant except that the column index is doubled
 * (xSrc << 1) because each pixel occupies two mlib_s16 samples.
 * Writes xSrc, ySrc and sPtr; X and Y are not advanced.  The expansion has
 * no trailing semicolon -- the caller adds it.
 */
295 #define NEXT_PIXEL_2BC_S16()                                    \
296   xSrc = (X >> MLIB_SHIFT)-1;                                   \
297   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
298   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
299 
300 /***************************************************************/
/*
 * Load everything needed to filter one 2-channel pixel, then advance X/Y.
 *
 * Parameters:
 *   mlib_filters_s16   - table from which the horizontal coefficients are
 *                        read as one mlib_d64 at byte offset filterposx.
 *   mlib_filters_s16_4 - table from which the vertical coefficients are
 *                        read as four consecutive mlib_d64 values starting
 *                        at byte offset filterposy * 4.
 *
 * For each of four consecutive source rows (sPtr advances by srcYStride
 * elements between rows) three 8-byte-aligned words are read
 * (dpSrc[0..2]) and combined with vis_alignaddr / vis_faligndata into two
 * mlib_d64 values: rowN0 holds the first 8 bytes starting at sPtr and
 * rowN1 the following 8 bytes -- sixteen bytes, i.e. eight 16-bit samples,
 * per row.
 *
 * Afterwards yFilter0..3 and xFilter are fetched from filter positions
 * derived from Y and X (FILTER_SHIFT / FILTER_MASK are defined outside
 * this section), and X, Y are advanced by dX, dY.
 *
 * Side effects in the caller's scope: dpSrc, data0..data2, row00..row31,
 * filterposx, filterposy, yPtr, yFilter0..3, xFilter, X, Y are overwritten,
 * and sPtr is left three strides below where it started.
 */
301 #define LOAD_BC_S16_2CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)    \
302   vis_alignaddr(sPtr, 0);                                               \
303   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
304   data0 = dpSrc[0];                                                     \
305   data1 = dpSrc[1];                                                     \
306   data2 = dpSrc[2];                                                     \
307   row00 = vis_faligndata(data0, data1);                                 \
308   row01 = vis_faligndata(data1, data2);                                 \
309   sPtr += srcYStride;                                                   \
310   vis_alignaddr(sPtr, 0);                                               \
311   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
312   data0 = dpSrc[0];                                                     \
313   data1 = dpSrc[1];                                                     \
314   data2 = dpSrc[2];                                                     \
315   row10 = vis_faligndata(data0, data1);                                 \
316   row11 = vis_faligndata(data1, data2);                                 \
317   sPtr += srcYStride;                                                   \
318   vis_alignaddr(sPtr, 0);                                               \
319   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
320   data0 = dpSrc[0];                                                     \
321   data1 = dpSrc[1];                                                     \
322   data2 = dpSrc[2];                                                     \
323   row20 = vis_faligndata(data0, data1);                                 \
324   row21 = vis_faligndata(data1, data2);                                 \
325   sPtr += srcYStride;                                                   \
326   vis_alignaddr(sPtr, 0);                                               \
327   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
328   data0 = dpSrc[0];                                                     \
329   data1 = dpSrc[1];                                                     \
330   data2 = dpSrc[2];                                                     \
331   row30 = vis_faligndata(data0, data1);                                 \
332   row31 = vis_faligndata(data1, data2);                                 \
333   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
334   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
335   yFilter0 = yPtr[0];                                                   \
336   yFilter1 = yPtr[1];                                                   \
337   yFilter2 = yPtr[2];                                                   \
338   yFilter3 = yPtr[3];                                                   \
339   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
340   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
341   X += dX;                                                              \
342   Y += dY
343 
344 /***************************************************************/
/*
 * Compute one 2-channel output pixel from the already loaded
 * row00..row31, yFilter0..yFilter3 and xFilter, leaving it in res.
 *
 *   - Vertical pass: for both halves of every row (rowN0, rowN1) the
 *     paired vis_fmul8sux16 / vis_fmul8ulx16 products are added with
 *     vis_fpadd16 and accumulated, giving sum0 (first half of the window)
 *     and sum1 (second half).
 *   - Coefficient shuffle: the chain of vis_fpmerge calls rearranges the
 *     four 16-bit values of xFilter so that xFilter0 holds the first two
 *     coefficients, each repeated twice, and xFilter1 the last two, each
 *     repeated twice -- one copy per channel of the interleaved samples.
 *   - Horizontal pass: sum0 * xFilter0 and sum1 * xFilter1 are added into
 *     d0; the high and low halves of d0 are then added (vis_fpadd16s) so
 *     that p0 carries one 16-bit total per channel.
 *   - p0 is widened with vis_fmuld8sux16(f_x01000100, ...) and packed by
 *     vis_fpackfix_pair(d0, d0), so both halves of res receive the same
 *     packed pixel.
 *
 * Scratch variables overwritten: u00..u21, v00..v31, sum0, sum1, dr, dr1,
 * xFilter0, xFilter1, d00, d10, d20, d30, d0, d1, p0.
 */
345 #define RESULT_2BC_S16_1PIXEL()                                 \
346   u00 = vis_fmul8sux16(row00, yFilter0);                        \
347   dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
348   u01 = vis_fmul8ulx16(row00, yFilter0);                        \
349   dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));           \
350   u10 = vis_fmul8sux16(row01, yFilter0);                        \
351   dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));          \
352   u11 = vis_fmul8ulx16(row01, yFilter0);                        \
353   dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));           \
354   u20 = vis_fmul8sux16(row10, yFilter1);                        \
355   v00 = vis_fpadd16(u00, u01);                                  \
356   u21 = vis_fmul8ulx16(row10, yFilter1);                        \
357   v01 = vis_fpadd16(u10, u11);                                  \
358   u00 = vis_fmul8sux16(row11, yFilter1);                        \
359   xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));    \
360   u01 = vis_fmul8ulx16(row11, yFilter1);                        \
361   u10 = vis_fmul8sux16(row20, yFilter2);                        \
362   u11 = vis_fmul8ulx16(row20, yFilter2);                        \
363   v10 = vis_fpadd16(u20, u21);                                  \
364   sum0 = vis_fpadd16(v00, v10);                                 \
365   u20 = vis_fmul8sux16(row21, yFilter2);                        \
366   v11 = vis_fpadd16(u00, u01);                                  \
367   u21 = vis_fmul8ulx16(row21, yFilter2);                        \
368   xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));    \
369   u00 = vis_fmul8sux16(row30, yFilter3);                        \
370   v20 = vis_fpadd16(u10, u11);                                  \
371   sum1 = vis_fpadd16(v01, v11);                                 \
372   u01 = vis_fmul8ulx16(row30, yFilter3);                        \
373   sum0 = vis_fpadd16(sum0, v20);                                \
374   v21 = vis_fpadd16(u20, u21);                                  \
375   u10 = vis_fmul8sux16(row31, yFilter3);                        \
376   v30 = vis_fpadd16(u00, u01);                                  \
377   sum1 = vis_fpadd16(sum1, v21);                                \
378   u11 = vis_fmul8ulx16(row31, yFilter3);                        \
379   sum0 = vis_fpadd16(sum0, v30);                                \
380   v31 = vis_fpadd16(u10, u11);                                  \
381   sum1 = vis_fpadd16(sum1, v31);                                \
382   d00 = vis_fmul8sux16(sum0, xFilter0);                         \
383   d10 = vis_fmul8ulx16(sum0, xFilter0);                         \
384   d20 = vis_fmul8sux16(sum1, xFilter1);                         \
385   d30 = vis_fmul8ulx16(sum1, xFilter1);                         \
386   d0 = vis_fpadd16(d00, d10);                                   \
387   d1 = vis_fpadd16(d20, d30);                                   \
388   d0 = vis_fpadd16(d0, d1);                                     \
389   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
390   d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
391   res = vis_fpackfix_pair(d0, d0)
392 
393 /***************************************************************/
/*
 * One step of the software-pipelined 2-channel loop.  A single expansion
 * works on three pixels at different stages:
 *
 *   - Pixel k (row00..row31, yFilter0..3 and xFilter already loaded): runs
 *     the vertical pass into sum0 / sum1, shuffles xFilter into xFilter0 /
 *     xFilter1 with vis_fpmerge (each coefficient duplicated for the two
 *     channels), and stores the horizontal lane products in d0<ind>
 *     (from sum0) and d1<ind> (from sum1).  ind is token-pasted, so
 *     ind = 0 writes d00/d10 and ind = 1 writes d01/d11.  FADD_2BC_S16
 *     combines and folds them later.
 *   - Pixel k+1: row00..row31 are reloaded from the four rows starting at
 *     the incoming sPtr (three aligned words per row), and yFilter0..3 /
 *     xFilter are reloaded from filter positions sampled from X and Y
 *     before they are incremented.
 *   - Pixel k+2: after X += dX and Y += dY, xSrc / ySrc are recomputed and
 *     sPtr is left pointing at that pixel's source window.
 *
 * Parameters:
 *   ind                - digit pasted onto d0 / d1 to pick the result pair.
 *   mlib_filters_s16   - horizontal coefficient table (byte offset filterposx).
 *   mlib_filters_s16_4 - vertical coefficient table (byte offset filterposy*4).
 *
 * The statements are interleaved (presumably for instruction scheduling):
 * every row, yFilterN and xFilter is overwritten only after its last use
 * for pixel k, so the statement order must be preserved when editing.
 * d0..d3 are used as scratch.
 */
394 #define BC_S16_2CH(ind, mlib_filters_s16, mlib_filters_s16_4)           \
395   u00 = vis_fmul8sux16(row00, yFilter0);                                \
396   dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));         \
397   u01 = vis_fmul8ulx16(row00, yFilter0);                                \
398   dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));                   \
399   u10 = vis_fmul8sux16(row01, yFilter0);                                \
400   dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));                  \
401   u11 = vis_fmul8ulx16(row01, yFilter0);                                \
402   dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));                   \
403   vis_alignaddr(sPtr, 0);                                               \
404   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
405   u20 = vis_fmul8sux16(row10, yFilter1);                                \
406   v00 = vis_fpadd16(u00, u01);                                          \
407   u21 = vis_fmul8ulx16(row10, yFilter1);                                \
408   data0 = dpSrc[0];                                                     \
409   filterposy = (Y >> FILTER_SHIFT);                                     \
410   v01 = vis_fpadd16(u10, u11);                                          \
411   data1 = dpSrc[1];                                                     \
412   u00 = vis_fmul8sux16(row11, yFilter1);                                \
413   xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));            \
414   data2 = dpSrc[2];                                                     \
415   u01 = vis_fmul8ulx16(row11, yFilter1);                                \
416   row00 = vis_faligndata(data0, data1);                                 \
417   u10 = vis_fmul8sux16(row20, yFilter2);                                \
418   row01 = vis_faligndata(data1, data2);                                 \
419   filterposx = (X >> FILTER_SHIFT);                                     \
420   sPtr += srcYStride;                                                   \
421   vis_alignaddr(sPtr, 0);                                               \
422   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
423   u11 = vis_fmul8ulx16(row20, yFilter2);                                \
424   v10 = vis_fpadd16(u20, u21);                                          \
425   data0 = dpSrc[0];                                                     \
426   sum0 = vis_fpadd16(v00, v10);                                         \
427   X += dX;                                                              \
428   data1 = dpSrc[1];                                                     \
429   u20 = vis_fmul8sux16(row21, yFilter2);                                \
430   v11 = vis_fpadd16(u00, u01);                                          \
431   data2 = dpSrc[2];                                                     \
432   row10 = vis_faligndata(data0, data1);                                 \
433   u21 = vis_fmul8ulx16(row21, yFilter2);                                \
434   row11 = vis_faligndata(data1, data2);                                 \
435   sPtr += srcYStride;                                                   \
436   xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));            \
437   vis_alignaddr(sPtr, 0);                                               \
438   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
439   u00 = vis_fmul8sux16(row30, yFilter3);                                \
440   v20 = vis_fpadd16(u10, u11);                                          \
441   Y += dY;                                                              \
442   xSrc = (X >> MLIB_SHIFT)-1;                                           \
443   sum1 = vis_fpadd16(v01, v11);                                         \
444   data0 = dpSrc[0];                                                     \
445   u01 = vis_fmul8ulx16(row30, yFilter3);                                \
446   sum0 = vis_fpadd16(sum0, v20);                                        \
447   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
448   data1 = dpSrc[1];                                                     \
449   v21 = vis_fpadd16(u20, u21);                                          \
450   u10 = vis_fmul8sux16(row31, yFilter3);                                \
451   data2 = dpSrc[2];                                                     \
452   v30 = vis_fpadd16(u00, u01);                                          \
453   filterposy &= FILTER_MASK;                                            \
454   row20 = vis_faligndata(data0, data1);                                 \
455   sum1 = vis_fpadd16(sum1, v21);                                        \
456   u11 = vis_fmul8ulx16(row31, yFilter3);                                \
457   row21 = vis_faligndata(data1, data2);                                 \
458   sPtr += srcYStride;                                                   \
459   filterposx &= FILTER_MASK;                                            \
460   v31 = vis_fpadd16(u10, u11);                                          \
461   vis_alignaddr(sPtr, 0);                                               \
462   dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
463   data0 = dpSrc[0];                                                     \
464   sum0 = vis_fpadd16(sum0, v30);                                        \
465   data1 = dpSrc[1];                                                     \
466   sum1 = vis_fpadd16(sum1, v31);                                        \
467   data2 = dpSrc[2];                                                     \
468   row30 = vis_faligndata(data0, data1);                                 \
469   d0 = vis_fmul8sux16(sum0, xFilter0);                                  \
470   row31 = vis_faligndata(data1, data2);                                 \
471   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
472   d1 = vis_fmul8ulx16(sum0, xFilter0);                                  \
473   yFilter0 = yPtr[0];                                                   \
474   d2 = vis_fmul8sux16(sum1, xFilter1);                                  \
475   yFilter1 = yPtr[1];                                                   \
476   d3 = vis_fmul8ulx16(sum1, xFilter1);                                  \
477   d0##ind = vis_fpadd16(d0, d1);                                        \
478   yFilter2 = yPtr[2];                                                   \
479   yFilter3 = yPtr[3];                                                   \
480   d1##ind = vis_fpadd16(d2, d3);                                        \
481   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
482   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
483 
484 /***************************************************************/
/*
 * Fold the lane products of two 2-channel pixels into res.
 *
 * d00/d10 (first pixel) and d01/d11 (second pixel), as written by
 * BC_S16_2CH with ind = 0 and ind = 1, are added pairwise; the high and
 * low halves of each sum are then added (vis_fpadd16s) so that p0 and p1
 * carry one 16-bit total per channel.  Both are widened with
 * vis_fmuld8sux16(f_x01000100, ...) and packed together by
 * vis_fpackfix_pair(d0, d1).
 *
 * d0, d1, d2, p0 and p1 are overwritten; d00, d10, d01, d11 are only read.
 */
485 #define FADD_2BC_S16()                                          \
486   d0 = vis_fpadd16(d00, d10);                                   \
487   d2 = vis_fpadd16(d01, d11);                                   \
488   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
489   p1 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
490   d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
491   d1 = vis_fmuld8sux16(f_x01000100, p1);                        \
492   res = vis_fpackfix_pair(d0, d1)
493 
494 /***************************************************************/
/*
 * Bicubic affine transform of a 2-channel mlib_s16 image using VIS
 * intrinsics.
 *
 * For every destination row j in [yStart, yFinish] the row is clipped with
 * CLIP(2) and its cols = xRight - xLeft + 1 pixels are computed and written
 * with masked partial stores (vis_pst_16).  Each full store carries one
 * mlib_d64 result, and the pixel counter i advances by 2 per store.
 *
 * The row is handled in stages:
 *   - cols >= 6: a software-pipelined prologue / loop / epilogue.  After it
 *     at most one pixel of the row is left.
 *   - otherwise a 4-pixel and/or 2-pixel group built from single-pixel
 *     results.
 *   - finally one remaining pixel, stored with an extra end-of-line mask.
 *
 * The helper macros NEXT_PIXEL_2BC_S16, LOAD_BC_S16_2CH_1PIXEL, BC_S16_2CH,
 * FADD_2BC_S16 and RESULT_2BC_S16_1PIXEL are defined earlier in this file;
 * they operate directly on the locals declared below.
 *
 * param  - affine transform parameters; presumably unpacked into locals
 *          (filter, yStart, yFinish, srcYStride, ...) by DECLAREVAR_BC(),
 *          which is defined outside this file section.
 * return - always MLIB_SUCCESS.
 */
mlib_ImageAffine_s16_2ch_bc(mlib_affine_param * param)495 mlib_status mlib_ImageAffine_s16_2ch_bc (mlib_affine_param *param)
496 {
497   DECLAREVAR_BC();
498   DTYPE  *dstLineEnd;
499   mlib_s32  filterposx, filterposy;
500   mlib_d64  data0, data1, data2;
501   mlib_d64  sum0, sum1;
502   mlib_d64  row00, row10, row20, row30;
503   mlib_d64  row01, row11, row21, row31;
504   mlib_f32  p0, p1;
505   mlib_d64  xFilter, xFilter0, xFilter1;
506   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
507   mlib_d64  v00, v01, v10, v11, v20, v21, v30, v31;
508   mlib_d64  u00, u01, u10, u11, u20, u21;
509   mlib_d64  d0, d1, d2, d3;
510   mlib_d64  d00, d10, d20, d30, d01, d11;
511   mlib_d64  *yPtr;
512   mlib_d64  *dp, *dpSrc;
513   mlib_s32  cols, i, mask, emask;
514   mlib_d64  res, res1;
515   mlib_d64  dr, dr1;
516   mlib_f32 f_x01000100 = vis_to_float(0x01000100);
517   const mlib_s16 *mlib_filters_table  ;
518   const mlib_s16 *mlib_filters_table_4;
519
  /* Pick the coefficient tables: "bc" for MLIB_BICUBIC, "bc2" otherwise. */
520   if (filter == MLIB_BICUBIC) {
521     mlib_filters_table   = mlib_filters_s16_bc;
522     mlib_filters_table_4 = mlib_filters_s16_bc_4;
523   } else {
524     mlib_filters_table   = mlib_filters_s16_bc2;
525     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
526   }
527
  /* Halve the stride; presumably converts bytes to mlib_s16 elements
   * because the source pointer is stepped as mlib_s16* -- confirm. */
528   srcYStride >>= 1;
529
530   for (j = yStart; j <= yFinish; j++) {
531
    /* Re-initialise the GSR for this row; presumably selects the scale
     * used by vis_fpackfix_pair -- confirm against the VIS manual. */
532     vis_write_gsr(10 << 3);
533
534     CLIP(2);
    /* After the "+= 1" below dstLineEnd addresses the last mlib_s16 of the
     * row, i.e. the second channel of pixel xRight. */
535     dstLineEnd  = (DTYPE*)dstData + 2 * xRight;
536
537     cols = xRight - xLeft + 1;
    /* dp is the aligned destination word pointer; mask is the edge mask
     * for the first (possibly partial) destination word. */
538     dp = vis_alignaddr(dstPixelPtr, 0);
539     dstLineEnd += 1;
540     mask = vis_edge16(dstPixelPtr, dstLineEnd);
541     i = 0;
542
    /* Pipelined path for rows of at least 6 pixels. */
543     if (i <= cols - 6) {
544
545       NEXT_PIXEL_2BC_S16();
546       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
547
548       NEXT_PIXEL_2BC_S16();
549
550       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
551       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
552
553       FADD_2BC_S16();
554
555       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
556       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
557
      /* Steady state: each iteration realigns the finished result to the
       * destination offset, writes it across two neighbouring aligned
       * words with complementary masks, then computes the next result. */
558 #pragma pipeloop(0)
559       for (; i <= cols-8; i += 2) {
560         vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
561         res = vis_faligndata(res, res);
562         vis_pst_16(res, dp++, mask);
563         vis_pst_16(res, dp, ~mask);
564         FADD_2BC_S16();
565         BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
566         BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
567       }
568
      /* Epilogue: drain the pipeline with three more stores (i += 6). */
569       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
570       res = vis_faligndata(res, res);
571       vis_pst_16(res, dp++, mask);
572       vis_pst_16(res, dp, ~mask);
573
574       FADD_2BC_S16();
575       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
576       res = vis_faligndata(res, res);
577       vis_pst_16(res, dp++, mask);
578       vis_pst_16(res, dp, ~mask);
579
      /* The last store is built from two single-pixel results: the high
       * half of the first (saved in res1) replaces the high half of the
       * second. */
580       RESULT_2BC_S16_1PIXEL();
581       res1 = res;
582
583       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
584       RESULT_2BC_S16_1PIXEL();
585       res = vis_write_hi(res, vis_read_hi(res1));
586       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
587       res = vis_faligndata(res, res);
588       vis_pst_16(res, dp++, mask);
589       vis_pst_16(res, dp, ~mask);
590
591       i += 6;
592     }
593
    /* Four pixels left (only reachable when the pipelined path was not
     * taken): two stores. */
594     if (i <= cols - 4) {
595       NEXT_PIXEL_2BC_S16();
596       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
597
598       NEXT_PIXEL_2BC_S16();
599
600       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
601       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
602
603       FADD_2BC_S16();
604       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
605       res = vis_faligndata(res, res);
606       vis_pst_16(res, dp++, mask);
607       vis_pst_16(res, dp, ~mask);
608
609       RESULT_2BC_S16_1PIXEL();
610       res1 = res;
611
612       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
613       RESULT_2BC_S16_1PIXEL();
614       res = vis_write_hi(res, vis_read_hi(res1));
615       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
616       res = vis_faligndata(res, res);
617       vis_pst_16(res, dp++, mask);
618       vis_pst_16(res, dp, ~mask);
619
620       i += 4;
621     }
622
    /* Two pixels left: two single-pixel results merged into one store. */
623     if (i <= cols - 2) {
624       NEXT_PIXEL_2BC_S16();
625       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
626       RESULT_2BC_S16_1PIXEL();
627       res1 = res;
628
629       NEXT_PIXEL_2BC_S16();
630       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
631       RESULT_2BC_S16_1PIXEL();
632       res = vis_write_hi(res, vis_read_hi(res1));
633       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
634       res = vis_faligndata(res, res);
635       vis_pst_16(res, dp++, mask);
636       vis_pst_16(res, dp, ~mask);
637
638       i += 2;
639     }
640
    /* One pixel left: the stores are additionally limited by an edge mask
     * computed against dstLineEnd, and the second word is written only if
     * it still starts at or before dstLineEnd. */
641     if (i < cols) {
642       NEXT_PIXEL_2BC_S16();
643       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
644       RESULT_2BC_S16_1PIXEL();
645       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
646       res = vis_faligndata(res, res);
647       emask = vis_edge16(dp, dstLineEnd);
648       vis_pst_16(res, dp++, mask & emask);
649
650       if ((mlib_s16*)dp <= dstLineEnd) {
651         mask = vis_edge16(dp, dstLineEnd);
652         vis_pst_16(res, dp, mask);
653       }
654     }
655   }
656
657   return MLIB_SUCCESS;
658 }
659 
660 /***************************************************************/
/*
 * Position sPtr for the next 3-channel pixel.
 *
 * xSrc and ySrc are X and Y shifted right by MLIB_SHIFT, minus one; sPtr
 * is then set to the mlib_s16 at lineAddr[ySrc] + xSrc*3 (3 samples per
 * pixel).  The "-1" presumably selects the top-left corner of the 4x4
 * bicubic neighbourhood -- the LOAD/BC macros read four rows from sPtr.
 *
 * Uses the caller's X, Y, xSrc, ySrc, lineAddr and sPtr (srcPixelPtr).
 * X and Y are not modified here.
 */
661 #define NEXT_PIXEL_3BC_S16()                                    \
662   xSrc = (X >> MLIB_SHIFT)-1;                                   \
663   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
664   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3)
665 
666 /***************************************************************/
/*
 * Load the source data and filter coefficients for one 3-channel pixel.
 *
 * mlib_filters_s16_3 - table the three x coefficient words are read from
 * mlib_filters_s16_4 - table the four y coefficient words are read from
 *
 * For each of four source rows, starting at sPtr and advancing sPtr by
 * srcYStride between rows, four mlib_d64 words are read from the aligned
 * address and combined pairwise with vis_faligndata into three words
 * rowN0..rowN2 (N = 0..3).  On exit sPtr has been advanced three times and
 * points into the fourth row.
 *
 * The coefficients are then fetched:
 *   yFilter0..3 from mlib_filters_s16_4 at byte offset filterposy*4,
 *   xFilter0..2 from mlib_filters_s16_3 at byte offset filterposx*3,
 * where filterposx/filterposy are X/Y shifted by FILTER_SHIFT and masked
 * with FILTER_MASK.  Finally X and Y are stepped by dX and dY.
 *
 * All named variables are locals of the calling function.
 */
667 #define LOAD_BC_S16_3CH_1PIXEL(mlib_filters_s16_3, mlib_filters_s16_4)  \
668   dpSrc = vis_alignaddr(sPtr, 0);                                       \
669   data0 = dpSrc[0];                                                     \
670   data1 = dpSrc[1];                                                     \
671   data2 = dpSrc[2];                                                     \
672   data3 = dpSrc[3];                                                     \
673   row00 = vis_faligndata(data0, data1);                                 \
674   row01 = vis_faligndata(data1, data2);                                 \
675   row02 = vis_faligndata(data2, data3);                                 \
676   sPtr += srcYStride;                                                   \
677   dpSrc = vis_alignaddr(sPtr, 0);                                       \
678   data0 = dpSrc[0];                                                     \
679   data1 = dpSrc[1];                                                     \
680   data2 = dpSrc[2];                                                     \
681   data3 = dpSrc[3];                                                     \
682   row10 = vis_faligndata(data0, data1);                                 \
683   row11 = vis_faligndata(data1, data2);                                 \
684   row12 = vis_faligndata(data2, data3);                                 \
685   sPtr += srcYStride;                                                   \
686   dpSrc = vis_alignaddr(sPtr, 0);                                       \
687   data0 = dpSrc[0];                                                     \
688   data1 = dpSrc[1];                                                     \
689   data2 = dpSrc[2];                                                     \
690   data3 = dpSrc[3];                                                     \
691   row20 = vis_faligndata(data0, data1);                                 \
692   row21 = vis_faligndata(data1, data2);                                 \
693   row22 = vis_faligndata(data2, data3);                                 \
694   sPtr += srcYStride;                                                   \
695   dpSrc = vis_alignaddr(sPtr, 0);                                       \
696   data0 = dpSrc[0];                                                     \
697   data1 = dpSrc[1];                                                     \
698   data2 = dpSrc[2];                                                     \
699   data3 = dpSrc[3];                                                     \
700   row30 = vis_faligndata(data0, data1);                                 \
701   row31 = vis_faligndata(data1, data2);                                 \
702   row32 = vis_faligndata(data2, data3);                                 \
703   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
704   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
705   yFilter0 = yPtr[0];                                                   \
706   yFilter1 = yPtr[1];                                                   \
707   yFilter2 = yPtr[2];                                                   \
708   yFilter3 = yPtr[3];                                                   \
709   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
710   xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
711   xFilter0 = xPtr[0];                                                   \
712   xFilter1 = xPtr[1];                                                   \
713   xFilter2 = xPtr[2];                                                   \
714   X += dX;                                                              \
715   Y += dY
716 
717 /***************************************************************/
/*
 * Write one finished 3-channel pixel to the destination.
 *
 * Copies f0.t[0], f0.t[1] and f0.t[2] to dstPixelPtr[0..2] and advances
 * dstPixelPtr by three elements.  f0 is the caller's union whose .d member
 * is filled by RESULT_3BC_S16_1PIXEL / FADD_3BC_S16; f0.t[3] is not stored.
 */
718 #define STORE_BC_S16_3CH_1PIXEL()                               \
719   dstPixelPtr[0] = f0.t[0];                                     \
720   dstPixelPtr[1] = f0.t[1];                                     \
721   dstPixelPtr[2] = f0.t[2];                                     \
722   dstPixelPtr += 3
723 
724 /***************************************************************/
/*
 * Compute one 3-channel result pixel from data that is already loaded
 * (rowN0..rowN2, yFilter0..3, xFilter0..2) and leave it in f0.d.  Nothing
 * is fetched from the source image and X/Y are not advanced.
 *
 * Steps, as written below:
 *   1. Vertical pass: every row word is multiplied by the yFilter word of
 *      its row.  Each product is formed as vis_fmul8sux16 + vis_fmul8ulx16
 *      added with vis_fpadd16, and the four rows are accumulated into
 *      sum0, sum1 and sum2.
 *   2. Horizontal pass: sum0..2 are multiplied the same way by
 *      xFilter0..2, giving d0, d1 and d2.
 *   3. Fold: d0/d1 and d1/d2 are shifted against each other with
 *      vis_faligndata (offsets 6 and 2) and added, so that a single word
 *      d0 holds the combined sums.
 *   4. Pack: both halves of d0 go through vis_fmuld8sux16 with
 *      f_x01000100 and vis_fpackfix_pair to produce f0.d.
 *
 * Steps 3 and 4 repeat the statements of FADD_3BC_S16.  The macro changes
 * the alignment set by vis_alignaddr and overwrites u??, v??, sum?, d0..d4.
 *
 * NOTE(review): the "row30 = vis_faligndata(data0, data1)" statement is
 * placed after the last read of row30 in this macro, so its value is not
 * used here; it looks like a leftover from BC_S16_3CH -- confirm before
 * removing.
 */
725 #define RESULT_3BC_S16_1PIXEL()                                 \
726   u00 = vis_fmul8sux16(row00, yFilter0);                        \
727   u01 = vis_fmul8ulx16(row00, yFilter0);                        \
728   u10 = vis_fmul8sux16(row01, yFilter0);                        \
729   u11 = vis_fmul8ulx16(row01, yFilter0);                        \
730   v00 = vis_fpadd16(u00, u01);                                  \
731   u20 = vis_fmul8sux16(row02, yFilter0);                        \
732   v01 = vis_fpadd16(u10, u11);                                  \
733   u21 = vis_fmul8ulx16(row02, yFilter0);                        \
734   u00 = vis_fmul8sux16(row10, yFilter1);                        \
735   u01 = vis_fmul8ulx16(row10, yFilter1);                        \
736   v02 = vis_fpadd16(u20, u21);                                  \
737   u10 = vis_fmul8sux16(row11, yFilter1);                        \
738   u11 = vis_fmul8ulx16(row11, yFilter1);                        \
739   v10 = vis_fpadd16(u00, u01);                                  \
740   u20 = vis_fmul8sux16(row12, yFilter1);                        \
741   u21 = vis_fmul8ulx16(row12, yFilter1);                        \
742   u00 = vis_fmul8sux16(row20, yFilter2);                        \
743   v11 = vis_fpadd16(u10, u11);                                  \
744   u01 = vis_fmul8ulx16(row20, yFilter2);                        \
745   v12 = vis_fpadd16(u20, u21);                                  \
746   u10 = vis_fmul8sux16(row21, yFilter2);                        \
747   u11 = vis_fmul8ulx16(row21, yFilter2);                        \
748   v20 = vis_fpadd16(u00, u01);                                  \
749   u20 = vis_fmul8sux16(row22, yFilter2);                        \
750   sum0 = vis_fpadd16(v00, v10);                                 \
751   u21 = vis_fmul8ulx16(row22, yFilter2);                        \
752   u00 = vis_fmul8sux16(row30, yFilter3);                        \
753   u01 = vis_fmul8ulx16(row30, yFilter3);                        \
754   v21 = vis_fpadd16(u10, u11);                                  \
755   sum1 = vis_fpadd16(v01, v11);                                 \
756   u10 = vis_fmul8sux16(row31, yFilter3);                        \
757   sum2 = vis_fpadd16(v02, v12);                                 \
758   v22 = vis_fpadd16(u20, u21);                                  \
759   u11 = vis_fmul8ulx16(row31, yFilter3);                        \
760   sum0 = vis_fpadd16(sum0, v20);                                \
761   u20 = vis_fmul8sux16(row32, yFilter3);                        \
762   v30 = vis_fpadd16(u00, u01);                                  \
763   sum1 = vis_fpadd16(sum1, v21);                                \
764   u21 = vis_fmul8ulx16(row32, yFilter3);                        \
765   v31 = vis_fpadd16(u10, u11);                                  \
766   sum2 = vis_fpadd16(sum2, v22);                                \
767   v32 = vis_fpadd16(u20, u21);                                  \
768   sum0 = vis_fpadd16(sum0, v30);                                \
769   row30 = vis_faligndata(data0, data1);                         \
770   v00 = vis_fmul8sux16(sum0, xFilter0);                         \
771   sum1 = vis_fpadd16(sum1, v31);                                \
772   sum2 = vis_fpadd16(sum2, v32);                                \
773   v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
774   v10 = vis_fmul8sux16(sum1, xFilter1);                         \
775   v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
776   d0 = vis_fpadd16(v00, v01);                                   \
777   v20 = vis_fmul8sux16(sum2, xFilter2);                         \
778   v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
779   d1 = vis_fpadd16(v10, v11);                                   \
780   d2 = vis_fpadd16(v20, v21);                                   \
781   vis_alignaddr((void*)6, 0);                                   \
782   d3 = vis_faligndata(d0, d1);                                  \
783   vis_alignaddr((void*)2, 0);                                   \
784   d4 = vis_faligndata(d1, d2);                                  \
785   d0 = vis_fpadd16(d0, d3);                                     \
786   d2 = vis_fpadd16(d2, d4);                                     \
787   d1 = vis_faligndata(d2, d2);                                  \
788   d0 = vis_fpadd16(d0, d1);                                     \
789   d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
790   d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
791   f0.d = vis_fpackfix_pair(d2, d3)
792 
793 /***************************************************************/
/*
 * Pipelined step for 3-channel pixels: finish the arithmetic for the pixel
 * whose rows/filters are currently loaded while loading everything for the
 * next pixel.
 *
 * mlib_filters_s16_3 - table the next x coefficient words are read from
 * mlib_filters_s16_4 - table the next y coefficient words are read from
 *
 * Work done, interleaved statement by statement:
 *   - Current pixel: the same vertical and horizontal multiply/accumulate
 *     sequence as RESULT_3BC_S16_1PIXEL, ending with the partial sums in
 *     d0, d1 and d2.  The final fold and pack are NOT done here; the
 *     caller follows up with FADD_3BC_S16.
 *   - Next pixel: the four source rows at sPtr (stepped by srcYStride) are
 *     loaded into rowN0..rowN2 as soon as the old values have been
 *     consumed, and yFilter0..3 / xFilter0..2 are reloaded from the tables
 *     at byte offsets filterposy*4 and filterposx*3.
 *   - Coordinates: filterposx/filterposy are taken from X/Y before they
 *     are stepped by dX/dY; xSrc/ySrc are taken after the step, and the
 *     last statement points sPtr at the pixel after the one just loaded.
 *
 * On entry sPtr must already address the pixel to load (NEXT_PIXEL_3BC_S16
 * or a previous BC_S16_3CH).  The statement order is hand-scheduled and
 * each vis_faligndata depends on the preceding vis_alignaddr, so the
 * statements must not be reordered.
 */
794 #define BC_S16_3CH(mlib_filters_s16_3, mlib_filters_s16_4)              \
795   u00 = vis_fmul8sux16(row00, yFilter0);                                \
796   u01 = vis_fmul8ulx16(row00, yFilter0);                                \
797   u10 = vis_fmul8sux16(row01, yFilter0);                                \
798   u11 = vis_fmul8ulx16(row01, yFilter0);                                \
799   v00 = vis_fpadd16(u00, u01);                                          \
800   u20 = vis_fmul8sux16(row02, yFilter0);                                \
801   v01 = vis_fpadd16(u10, u11);                                          \
802   u21 = vis_fmul8ulx16(row02, yFilter0);                                \
803   dpSrc = vis_alignaddr(sPtr, 0);                                       \
804   u00 = vis_fmul8sux16(row10, yFilter1);                                \
805   u01 = vis_fmul8ulx16(row10, yFilter1);                                \
806   data0 = dpSrc[0];                                                     \
807   filterposy = (Y >> FILTER_SHIFT);                                     \
808   v02 = vis_fpadd16(u20, u21);                                          \
809   data1 = dpSrc[1];                                                     \
810   u10 = vis_fmul8sux16(row11, yFilter1);                                \
811   data2 = dpSrc[2];                                                     \
812   u11 = vis_fmul8ulx16(row11, yFilter1);                                \
813   v10 = vis_fpadd16(u00, u01);                                          \
814   data3 = dpSrc[3];                                                     \
815   u20 = vis_fmul8sux16(row12, yFilter1);                                \
816   row00 = vis_faligndata(data0, data1);                                 \
817   u21 = vis_fmul8ulx16(row12, yFilter1);                                \
818   row01 = vis_faligndata(data1, data2);                                 \
819   u00 = vis_fmul8sux16(row20, yFilter2);                                \
820   row02 = vis_faligndata(data2, data3);                                 \
821   filterposx = (X >> FILTER_SHIFT);                                     \
822   sPtr += srcYStride;                                                   \
823   dpSrc = vis_alignaddr(sPtr, 0);                                       \
824   v11 = vis_fpadd16(u10, u11);                                          \
825   u01 = vis_fmul8ulx16(row20, yFilter2);                                \
826   v12 = vis_fpadd16(u20, u21);                                          \
827   data0 = dpSrc[0];                                                     \
828   u10 = vis_fmul8sux16(row21, yFilter2);                                \
829   X += dX;                                                              \
830   data1 = dpSrc[1];                                                     \
831   u11 = vis_fmul8ulx16(row21, yFilter2);                                \
832   v20 = vis_fpadd16(u00, u01);                                          \
833   data2 = dpSrc[2];                                                     \
834   u20 = vis_fmul8sux16(row22, yFilter2);                                \
835   sum0 = vis_fpadd16(v00, v10);                                         \
836   data3 = dpSrc[3];                                                     \
837   row10 = vis_faligndata(data0, data1);                                 \
838   u21 = vis_fmul8ulx16(row22, yFilter2);                                \
839   row11 = vis_faligndata(data1, data2);                                 \
840   u00 = vis_fmul8sux16(row30, yFilter3);                                \
841   row12 = vis_faligndata(data2, data3);                                 \
842   sPtr += srcYStride;                                                   \
843   dpSrc = vis_alignaddr(sPtr, 0);                                       \
844   u01 = vis_fmul8ulx16(row30, yFilter3);                                \
845   v21 = vis_fpadd16(u10, u11);                                          \
846   Y += dY;                                                              \
847   xSrc = (X >> MLIB_SHIFT)-1;                                           \
848   sum1 = vis_fpadd16(v01, v11);                                         \
849   data0 = dpSrc[0];                                                     \
850   u10 = vis_fmul8sux16(row31, yFilter3);                                \
851   sum2 = vis_fpadd16(v02, v12);                                         \
852   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
853   data1 = dpSrc[1];                                                     \
854   v22 = vis_fpadd16(u20, u21);                                          \
855   u11 = vis_fmul8ulx16(row31, yFilter3);                                \
856   data2 = dpSrc[2];                                                     \
857   sum0 = vis_fpadd16(sum0, v20);                                        \
858   u20 = vis_fmul8sux16(row32, yFilter3);                                \
859   data3 = dpSrc[3];                                                     \
860   v30 = vis_fpadd16(u00, u01);                                          \
861   filterposy &= FILTER_MASK;                                            \
862   row20 = vis_faligndata(data0, data1);                                 \
863   sum1 = vis_fpadd16(sum1, v21);                                        \
864   u21 = vis_fmul8ulx16(row32, yFilter3);                                \
865   row21 = vis_faligndata(data1, data2);                                 \
866   row22 = vis_faligndata(data2, data3);                                 \
867   sPtr += srcYStride;                                                   \
868   filterposx &= FILTER_MASK;                                            \
869   v31 = vis_fpadd16(u10, u11);                                          \
870   dpSrc = vis_alignaddr(sPtr, 0);                                       \
871   data0 = dpSrc[0];                                                     \
872   sum2 = vis_fpadd16(sum2, v22);                                        \
873   data1 = dpSrc[1];                                                     \
874   v32 = vis_fpadd16(u20, u21);                                          \
875   data2 = dpSrc[2];                                                     \
876   sum0 = vis_fpadd16(sum0, v30);                                        \
877   data3 = dpSrc[3];                                                     \
878   row30 = vis_faligndata(data0, data1);                                 \
879   v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
880   row31 = vis_faligndata(data1, data2);                                 \
881   row32 = vis_faligndata(data2, data3);                                 \
882   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
883   sum1 = vis_fpadd16(sum1, v31);                                        \
884   yFilter0 = yPtr[0];                                                   \
885   sum2 = vis_fpadd16(sum2, v32);                                        \
886   v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
887   yFilter1 = yPtr[1];                                                   \
888   v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
889   yFilter2 = yPtr[2];                                                   \
890   v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
891   d0 = vis_fpadd16(v00, v01);                                           \
892   yFilter3 = yPtr[3];                                                   \
893   xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
894   v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
895   xFilter0 = xPtr[0];                                                   \
896   v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
897   d1 = vis_fpadd16(v10, v11);                                           \
898   xFilter1 = xPtr[1];                                                   \
899   d2 = vis_fpadd16(v20, v21);                                           \
900   xFilter2 = xPtr[2];                                                   \
901   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3)
902 
903 /***************************************************************/
/*
 * Final reduction for one 3-channel pixel: turn the partial sums d0, d1,
 * d2 left by BC_S16_3CH into the packed result f0.d.
 *
 * d0/d1 are combined with vis_faligndata at offset 6 and d1/d2 at offset
 * 2; the shifted words are added so that d0 holds the combined sums.  The
 * two halves of d0 are then passed through vis_fmuld8sux16 with
 * f_x01000100 and packed with vis_fpackfix_pair.
 *
 * Side effects: changes the alignment set by vis_alignaddr and overwrites
 * d0..d4.  Same statements as the tail of RESULT_3BC_S16_1PIXEL.
 */
904 #define FADD_3BC_S16()                                          \
905   vis_alignaddr((void*)6, 0);                                   \
906   d3 = vis_faligndata(d0, d1);                                  \
907   vis_alignaddr((void*)2, 0);                                   \
908   d4 = vis_faligndata(d1, d2);                                  \
909   d0 = vis_fpadd16(d0, d3);                                     \
910   d2 = vis_fpadd16(d2, d4);                                     \
911   d1 = vis_faligndata(d2, d2);                                  \
912   d0 = vis_fpadd16(d0, d1);                                     \
913   d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
914   d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
915   f0.d = vis_fpackfix_pair(d2, d3)
916 
917 /***************************************************************/
/*
 * Bicubic affine transform of a 3-channel mlib_s16 image using VIS
 * intrinsics.
 *
 * For every destination row j in [yStart, yFinish] the row is clipped with
 * CLIP(3) and its cols = xRight - xLeft + 1 pixels are computed one at a
 * time.  Each result is left in the union f0 and its first three mlib_s16
 * elements are copied to the destination by STORE_BC_S16_3CH_1PIXEL.
 *
 * Rows with at least 4 pixels use a software-pipelined sequence that
 * produces all cols pixels (cols - 4 in the loop, 4 in the epilogue).
 * Shorter rows are handled entirely by the simple per-pixel loop at the
 * end.
 *
 * param  - affine transform parameters; presumably unpacked into locals
 *          (filter, yStart, yFinish, srcYStride, ...) by DECLAREVAR_BC(),
 *          which is defined outside this file section.
 * return - always MLIB_SUCCESS.
 */
mlib_ImageAffine_s16_3ch_bc(mlib_affine_param * param)918 mlib_status mlib_ImageAffine_s16_3ch_bc (mlib_affine_param *param)
919 {
920   DECLAREVAR_BC();
921   mlib_s32  filterposx, filterposy;
922   mlib_d64  data0, data1, data2, data3;
923   mlib_d64  sum0, sum1, sum2;
924   mlib_d64  row00, row10, row20, row30;
925   mlib_d64  row01, row11, row21, row31;
926   mlib_d64  row02, row12, row22, row32;
927   mlib_d64  xFilter0, xFilter1, xFilter2;
928   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
929   mlib_d64  v00, v01, v02, v10, v11, v12, v20, v21, v22, v30, v31, v32;
930   mlib_d64  u00, u01, u10, u11, u20, u21;
931   mlib_d64  d0, d1, d2, d3, d4;
932   mlib_d64 *yPtr, *xPtr;
933   mlib_d64 *dpSrc;
934   mlib_s32  cols, i;
935   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
  /* Lets the packed mlib_d64 result be read back as mlib_s16 samples. */
936   union {
937     mlib_s16 t[4];
938     mlib_d64 d;
939   } f0;
940   const mlib_s16 *mlib_filters_table_3;
941   const mlib_s16 *mlib_filters_table_4;
942
  /* Pick the coefficient tables: "bc" for MLIB_BICUBIC, "bc2" otherwise. */
943   if (filter == MLIB_BICUBIC) {
944     mlib_filters_table_3 = mlib_filters_s16_bc_3;
945     mlib_filters_table_4 = mlib_filters_s16_bc_4;
946   } else {
947     mlib_filters_table_3 = mlib_filters_s16_bc2_3;
948     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
949   }
950
  /* Halve the stride; presumably converts bytes to mlib_s16 elements
   * because sPtr is stepped as mlib_s16* -- confirm. */
951   srcYStride >>= 1;
952
953   for (j = yStart; j <= yFinish; j++) {
954
    /* Re-initialise the GSR for this row; presumably selects the scale
     * used by vis_fpackfix_pair -- confirm against the VIS manual. */
955     vis_write_gsr(10 << 3);
956
957     CLIP(3);
958
959     cols = xRight - xLeft + 1;
960
961     i = 0;
962
    /* Pipelined path for rows of at least 4 pixels. */
963     if (i <= cols - 4) {
964
      /* Prologue: load pixel 0, then run two BC steps so that one packed
       * result (f0) and one set of partial sums (d0..d2) are in flight. */
965       NEXT_PIXEL_3BC_S16();
966       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
967
968       NEXT_PIXEL_3BC_S16();
969
970       BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
971       FADD_3BC_S16();
972
973       BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
974
      /* Steady state: store the finished pixel, pack the next one and
       * start another. */
975 #pragma pipeloop(0)
976       for (; i < cols-4; i++) {
977         STORE_BC_S16_3CH_1PIXEL();
978
979         FADD_3BC_S16();
980         BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
981       }
982
      /* Epilogue: drain the pipeline and finish the last four pixels. */
983       STORE_BC_S16_3CH_1PIXEL();
984
985       FADD_3BC_S16();
986       STORE_BC_S16_3CH_1PIXEL();
987
988       RESULT_3BC_S16_1PIXEL();
989       STORE_BC_S16_3CH_1PIXEL();
990
991       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
992       RESULT_3BC_S16_1PIXEL();
993       STORE_BC_S16_3CH_1PIXEL();
994       i += 4;
995     }
996
    /* Unpipelined path; only does work for rows shorter than 4 pixels,
     * since the block above leaves i == cols. */
997     for (; i < cols; i++) {
998       NEXT_PIXEL_3BC_S16();
999       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
1000       RESULT_3BC_S16_1PIXEL();
1001       STORE_BC_S16_3CH_1PIXEL();
1002     }
1003   }
1004
1005   return MLIB_SUCCESS;
1006 }
1007 
1008 /***************************************************************/
/*
 * Position sPtr for the next 4-channel pixel.
 *
 * xSrc and ySrc are X and Y shifted right by MLIB_SHIFT, minus one; sPtr
 * is then set to the mlib_s16 at lineAddr[ySrc] + xSrc*4 (xSrc << 2, four
 * samples per pixel).  The "-1" presumably selects the top-left corner of
 * the 4x4 bicubic neighbourhood -- LOAD_BC_S16_4CH_1PIXEL reads four rows
 * from sPtr.
 *
 * Uses the caller's X, Y, xSrc, ySrc, lineAddr and sPtr (srcPixelPtr).
 * X and Y are not modified here.
 */
1009 #define NEXT_PIXEL_4BC_S16()                                    \
1010   xSrc = (X >> MLIB_SHIFT)-1;                                   \
1011   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
1012   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1013 
1014 /***************************************************************/
/*
 * Load the source data and filter coefficients for one 4-channel pixel.
 *
 * mlib_filters_s16_4 - table both the x and the y coefficient words are
 *                      read from
 *
 * For each of four source rows, starting at sPtr and advancing sPtr by
 * srcYStride between rows, five mlib_d64 words are read from the aligned
 * address and combined pairwise with vis_faligndata into four words
 * rowN0..rowN3 (N = 0..3).  On exit sPtr has been advanced three times and
 * points into the fourth row.
 *
 * The coefficients are then fetched from the same table:
 *   yFilter0..3 at byte offset filterposy*4,
 *   xFilter0..3 at byte offset filterposx*4,
 * where filterposx/filterposy are X/Y shifted by FILTER_SHIFT and masked
 * with FILTER_MASK.  Finally X and Y are stepped by dX and dY.
 *
 * All named variables are locals of the calling function.
 */
1015 #define LOAD_BC_S16_4CH_1PIXEL(mlib_filters_s16_4)                      \
1016   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1017   data0 = dpSrc[0];                                                     \
1018   data1 = dpSrc[1];                                                     \
1019   data2 = dpSrc[2];                                                     \
1020   data3 = dpSrc[3];                                                     \
1021   data4 = dpSrc[4];                                                     \
1022   row00 = vis_faligndata(data0, data1);                                 \
1023   row01 = vis_faligndata(data1, data2);                                 \
1024   row02 = vis_faligndata(data2, data3);                                 \
1025   row03 = vis_faligndata(data3, data4);                                 \
1026   sPtr += srcYStride;                                                   \
1027   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1028   data0 = dpSrc[0];                                                     \
1029   data1 = dpSrc[1];                                                     \
1030   data2 = dpSrc[2];                                                     \
1031   data3 = dpSrc[3];                                                     \
1032   data4 = dpSrc[4];                                                     \
1033   row10 = vis_faligndata(data0, data1);                                 \
1034   row11 = vis_faligndata(data1, data2);                                 \
1035   row12 = vis_faligndata(data2, data3);                                 \
1036   row13 = vis_faligndata(data3, data4);                                 \
1037   sPtr += srcYStride;                                                   \
1038   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1039   data0 = dpSrc[0];                                                     \
1040   data1 = dpSrc[1];                                                     \
1041   data2 = dpSrc[2];                                                     \
1042   data3 = dpSrc[3];                                                     \
1043   data4 = dpSrc[4];                                                     \
1044   row20 = vis_faligndata(data0, data1);                                 \
1045   row21 = vis_faligndata(data1, data2);                                 \
1046   row22 = vis_faligndata(data2, data3);                                 \
1047   row23 = vis_faligndata(data3, data4);                                 \
1048   sPtr += srcYStride;                                                   \
1049   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1050   data0 = dpSrc[0];                                                     \
1051   data1 = dpSrc[1];                                                     \
1052   data2 = dpSrc[2];                                                     \
1053   data3 = dpSrc[3];                                                     \
1054   data4 = dpSrc[4];                                                     \
1055   row30 = vis_faligndata(data0, data1);                                 \
1056   row31 = vis_faligndata(data1, data2);                                 \
1057   row32 = vis_faligndata(data2, data3);                                 \
1058   row33 = vis_faligndata(data3, data4);                                 \
1059   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
1060   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
1061   yFilter0 = yPtr[0];                                                   \
1062   yFilter1 = yPtr[1];                                                   \
1063   yFilter2 = yPtr[2];                                                   \
1064   yFilter3 = yPtr[3];                                                   \
1065   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
1066   xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
1067   xFilter0 = xPtr[0];                                                   \
1068   xFilter1 = xPtr[1];                                                   \
1069   xFilter2 = xPtr[2];                                                   \
1070   xFilter3 = xPtr[3];                                                   \
1071   X += dX;                                                              \
1072   Y += dY
1073 
1074 /***************************************************************/
/*
 * RESULT_4BC_S16_1PIXEL
 *
 * Produces one filtered result in `res` from the 4x4 block of source
 * values that is already held in row00..row33 (rowRC: R selects the
 * yFilter that is applied, C selects the xFilter / column sum).
 *
 * Steps, in data-flow order (the statements themselves are interleaved):
 *   1. vRC  = rowRC * yFilterR. Each product is written as a
 *             vis_fmul8sux16 / vis_fmul8ulx16 pair whose two halves are
 *             added with vis_fpadd16 (see the VIS manual for the exact
 *             semantics of the pair).
 *   2. sumC = v0C + v1C + v2C + v3C, i.e. one vertical sum per column.
 *   3. dC   = sumC * xFilterC, using the same multiply pair.
 *   4. d0   = d0 + d1 + d2 + d3.
 *   5. The high and low halves of d0 are passed through vis_fmuld8sux16
 *      with the constant f_x01000100 and repacked by vis_fpackfix_pair.
 *      NOTE(review): presumably this widens to 32 bits so that fpackfix
 *      can apply the GSR scale factor and saturate - confirm against the
 *      VIS documentation.
 *
 * Reads:  row00..row33, yFilter0..3, xFilter0..3, f_x01000100.
 * Writes: u00..u31, v00..v33, sum0..sum3, d0..d3, res.
 *
 * The expansion is a plain statement sequence without a trailing ';' and
 * without a do { } while (0) wrapper, so it must be used as a statement
 * inside a braced block.
 */
1075 #define RESULT_4BC_S16_1PIXEL()                                 \
1076   u00 = vis_fmul8sux16(row00, yFilter0);                        \
1077   u01 = vis_fmul8ulx16(row00, yFilter0);                        \
1078   u10 = vis_fmul8sux16(row01, yFilter0);                        \
1079   u11 = vis_fmul8ulx16(row01, yFilter0);                        \
1080   v00 = vis_fpadd16(u00, u01);                                  \
1081   u20 = vis_fmul8sux16(row02, yFilter0);                        \
1082   v01 = vis_fpadd16(u10, u11);                                  \
1083   u21 = vis_fmul8ulx16(row02, yFilter0);                        \
1084   u30 = vis_fmul8sux16(row03, yFilter0);                        \
1085   u31 = vis_fmul8ulx16(row03, yFilter0);                        \
1086   v02 = vis_fpadd16(u20, u21);                                  \
1087   u00 = vis_fmul8sux16(row10, yFilter1);                        \
1088   u01 = vis_fmul8ulx16(row10, yFilter1);                        \
1089   v03 = vis_fpadd16(u30, u31);                                  \
1090   u10 = vis_fmul8sux16(row11, yFilter1);                        \
1091   u11 = vis_fmul8ulx16(row11, yFilter1);                        \
1092   v10 = vis_fpadd16(u00, u01);                                  \
1093   u20 = vis_fmul8sux16(row12, yFilter1);                        \
1094   v11 = vis_fpadd16(u10, u11);                                  \
1095   u21 = vis_fmul8ulx16(row12, yFilter1);                        \
1096   u30 = vis_fmul8sux16(row13, yFilter1);                        \
1097   u31 = vis_fmul8ulx16(row13, yFilter1);                        \
1098   u00 = vis_fmul8sux16(row20, yFilter2);                        \
1099   v12 = vis_fpadd16(u20, u21);                                  \
1100   u01 = vis_fmul8ulx16(row20, yFilter2);                        \
1101   v13 = vis_fpadd16(u30, u31);                                  \
1102   u10 = vis_fmul8sux16(row21, yFilter2);                        \
1103   u11 = vis_fmul8ulx16(row21, yFilter2);                        \
1104   v20 = vis_fpadd16(u00, u01);                                  \
1105   u20 = vis_fmul8sux16(row22, yFilter2);                        \
1106   sum0 = vis_fpadd16(v00, v10);                                 \
1107   u21 = vis_fmul8ulx16(row22, yFilter2);                        \
1108   u30 = vis_fmul8sux16(row23, yFilter2);                        \
1109   u31 = vis_fmul8ulx16(row23, yFilter2);                        \
1110   u00 = vis_fmul8sux16(row30, yFilter3);                        \
1111   u01 = vis_fmul8ulx16(row30, yFilter3);                        \
1112   v21 = vis_fpadd16(u10, u11);                                  \
1113   sum1 = vis_fpadd16(v01, v11);                                 \
1114   u10 = vis_fmul8sux16(row31, yFilter3);                        \
1115   sum2 = vis_fpadd16(v02, v12);                                 \
1116   sum3 = vis_fpadd16(v03, v13);                                 \
1117   v22 = vis_fpadd16(u20, u21);                                  \
1118   u11 = vis_fmul8ulx16(row31, yFilter3);                        \
1119   sum0 = vis_fpadd16(sum0, v20);                                \
1120   u20 = vis_fmul8sux16(row32, yFilter3);                        \
1121   u21 = vis_fmul8ulx16(row32, yFilter3);                        \
1122   v23 = vis_fpadd16(u30, u31);                                  \
1123   v30 = vis_fpadd16(u00, u01);                                  \
1124   sum1 = vis_fpadd16(sum1, v21);                                \
1125   u30 = vis_fmul8sux16(row33, yFilter3);                        \
1126   u31 = vis_fmul8ulx16(row33, yFilter3);                        \
1127   v31 = vis_fpadd16(u10, u11);                                  \
1128   sum2 = vis_fpadd16(sum2, v22);                                \
1129   sum3 = vis_fpadd16(sum3, v23);                                \
1130   v32 = vis_fpadd16(u20, u21);                                  \
1131   sum0 = vis_fpadd16(sum0, v30);                                \
1132   v33 = vis_fpadd16(u30, u31);                                  \
1133   v00 = vis_fmul8sux16(sum0, xFilter0);                         \
1134   sum1 = vis_fpadd16(sum1, v31);                                \
1135   sum2 = vis_fpadd16(sum2, v32);                                \
1136   v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
1137   v10 = vis_fmul8sux16(sum1, xFilter1);                         \
1138   sum3 = vis_fpadd16(sum3, v33);                                \
1139   v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
1140   d0 = vis_fpadd16(v00, v01);                                   \
1141   v20 = vis_fmul8sux16(sum2, xFilter2);                         \
1142   v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
1143   d1 = vis_fpadd16(v10, v11);                                   \
1144   v30 = vis_fmul8sux16(sum3, xFilter3);                         \
1145   v31 = vis_fmul8ulx16(sum3, xFilter3);                         \
1146   d2 = vis_fpadd16(v20, v21);                                   \
1147   d3 = vis_fpadd16(v30, v31);                                   \
1148   d0 = vis_fpadd16(d0, d1);                                     \
1149   d2 = vis_fpadd16(d2, d3);                                     \
1150   d0 = vis_fpadd16(d0, d2);                                     \
1151   d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1152   d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1153   res = vis_fpackfix_pair(d2, d3)
1154 
1155 /***************************************************************/
/*
 * BC_S16_4CH(mlib_filters_s16_4)
 *
 * One stage of the software-pipelined bicubic loop. A single expansion
 * interleaves three independent strands of work:
 *
 *   A. Arithmetic for the pixel whose data is in row00..row33 on entry:
 *      the same products and sums as RESULT_4BC_S16_1PIXEL, stopping at
 *      d0..d3. The final reduction and packing are left to FADD_4BC_S16.
 *
 *   B. Loading the next 4x4 source block through sPtr. For each of four
 *      source lines: dpSrc = vis_alignaddr(sPtr, 0), five mlib_d64 words
 *      data0..data4 are read, and four vis_faligndata calls extract
 *      rowR0..rowR3. sPtr is advanced by srcYStride between lines (three
 *      times in total).
 *
 *   C. Fetching the coefficients and the source address for later pixels:
 *      filterposy / filterposx are taken from Y / X (>> FILTER_SHIFT,
 *      & FILTER_MASK) BEFORE X and Y are advanced by dX / dY, and are used
 *      as byte offsets (filterpos * 4) into the table passed as the macro
 *      argument; four mlib_d64 coefficients are read for each axis.
 *      xSrc / ySrc are computed AFTER the advance, and the last statement
 *      points sPtr at lineAddr[ySrc] + (xSrc << 2) for the following
 *      expansion.
 *
 * Statement order is significant: every rowRC, xFilterN and yFilterN
 * register is overwritten only after its last use by strand A within the
 * same expansion. Do not reorder statements without re-checking this.
 *
 * Side effects besides the locals listed above: X, Y, xSrc, ySrc, sPtr,
 * filterposx, filterposy, xPtr, yPtr and dpSrc are modified, and each
 * vis_alignaddr(sPtr, 0) call re-targets the VIS alignment state at the
 * source pointer (the caller re-issues vis_alignaddr before its stores).
 *
 * NOTE(review): five words are read per source line (dpSrc[0..4]); this
 * presumably relies on the source buffer being readable up to the next
 * 8-byte boundary past the 4th pixel - confirm against the allocator /
 * edge handling of the caller.
 *
 * The expansion has no trailing ';' and no do { } while (0) wrapper.
 */
1156 #define BC_S16_4CH(mlib_filters_s16_4)                                  \
1157   u00 = vis_fmul8sux16(row00, yFilter0);                                \
1158   u01 = vis_fmul8ulx16(row00, yFilter0);                                \
1159   u10 = vis_fmul8sux16(row01, yFilter0);                                \
1160   u11 = vis_fmul8ulx16(row01, yFilter0);                                \
1161   v00 = vis_fpadd16(u00, u01);                                          \
1162   u20 = vis_fmul8sux16(row02, yFilter0);                                \
1163   v01 = vis_fpadd16(u10, u11);                                          \
1164   u21 = vis_fmul8ulx16(row02, yFilter0);                                \
1165   u30 = vis_fmul8sux16(row03, yFilter0);                                \
1166   u31 = vis_fmul8ulx16(row03, yFilter0);                                \
1167   v02 = vis_fpadd16(u20, u21);                                          \
1168   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1169   u00 = vis_fmul8sux16(row10, yFilter1);                                \
1170   u01 = vis_fmul8ulx16(row10, yFilter1);                                \
1171   data0 = dpSrc[0];                                                     \
1172   filterposy = (Y >> FILTER_SHIFT);                                     \
1173   v03 = vis_fpadd16(u30, u31);                                          \
1174   data1 = dpSrc[1];                                                     \
1175   u10 = vis_fmul8sux16(row11, yFilter1);                                \
1176   data2 = dpSrc[2];                                                     \
1177   u11 = vis_fmul8ulx16(row11, yFilter1);                                \
1178   v10 = vis_fpadd16(u00, u01);                                          \
1179   data3 = dpSrc[3];                                                     \
1180   u20 = vis_fmul8sux16(row12, yFilter1);                                \
1181   v11 = vis_fpadd16(u10, u11);                                          \
1182   data4 = dpSrc[4];                                                     \
1183   u21 = vis_fmul8ulx16(row12, yFilter1);                                \
1184   row00 = vis_faligndata(data0, data1);                                 \
1185   u30 = vis_fmul8sux16(row13, yFilter1);                                \
1186   row01 = vis_faligndata(data1, data2);                                 \
1187   u31 = vis_fmul8ulx16(row13, yFilter1);                                \
1188   row02 = vis_faligndata(data2, data3);                                 \
1189   u00 = vis_fmul8sux16(row20, yFilter2);                                \
1190   row03 = vis_faligndata(data3, data4);                                 \
1191   filterposx = (X >> FILTER_SHIFT);                                     \
1192   sPtr += srcYStride;                                                   \
1193   v12 = vis_fpadd16(u20, u21);                                          \
1194   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1195   u01 = vis_fmul8ulx16(row20, yFilter2);                                \
1196   v13 = vis_fpadd16(u30, u31);                                          \
1197   data0 = dpSrc[0];                                                     \
1198   u10 = vis_fmul8sux16(row21, yFilter2);                                \
1199   X += dX;                                                              \
1200   data1 = dpSrc[1];                                                     \
1201   u11 = vis_fmul8ulx16(row21, yFilter2);                                \
1202   v20 = vis_fpadd16(u00, u01);                                          \
1203   data2 = dpSrc[2];                                                     \
1204   u20 = vis_fmul8sux16(row22, yFilter2);                                \
1205   sum0 = vis_fpadd16(v00, v10);                                         \
1206   data3 = dpSrc[3];                                                     \
1207   u21 = vis_fmul8ulx16(row22, yFilter2);                                \
1208   data4 = dpSrc[4];                                                     \
1209   row10 = vis_faligndata(data0, data1);                                 \
1210   u30 = vis_fmul8sux16(row23, yFilter2);                                \
1211   row11 = vis_faligndata(data1, data2);                                 \
1212   u31 = vis_fmul8ulx16(row23, yFilter2);                                \
1213   row12 = vis_faligndata(data2, data3);                                 \
1214   u00 = vis_fmul8sux16(row30, yFilter3);                                \
1215   row13 = vis_faligndata(data3, data4);                                 \
1216   sPtr += srcYStride;                                                   \
1217   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1218   u01 = vis_fmul8ulx16(row30, yFilter3);                                \
1219   v21 = vis_fpadd16(u10, u11);                                          \
1220   Y += dY;                                                              \
1221   xSrc = (X >> MLIB_SHIFT)-1;                                           \
1222   sum1 = vis_fpadd16(v01, v11);                                         \
1223   data0 = dpSrc[0];                                                     \
1224   u10 = vis_fmul8sux16(row31, yFilter3);                                \
1225   sum2 = vis_fpadd16(v02, v12);                                         \
1226   sum3 = vis_fpadd16(v03, v13);                                         \
1227   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
1228   data1 = dpSrc[1];                                                     \
1229   v22 = vis_fpadd16(u20, u21);                                          \
1230   u11 = vis_fmul8ulx16(row31, yFilter3);                                \
1231   data2 = dpSrc[2];                                                     \
1232   sum0 = vis_fpadd16(sum0, v20);                                        \
1233   u20 = vis_fmul8sux16(row32, yFilter3);                                \
1234   data3 = dpSrc[3];                                                     \
1235   u21 = vis_fmul8ulx16(row32, yFilter3);                                \
1236   v23 = vis_fpadd16(u30, u31);                                          \
1237   data4 = dpSrc[4];                                                     \
1238   v30 = vis_fpadd16(u00, u01);                                          \
1239   filterposy &= FILTER_MASK;                                            \
1240   row20 = vis_faligndata(data0, data1);                                 \
1241   sum1 = vis_fpadd16(sum1, v21);                                        \
1242   u30 = vis_fmul8sux16(row33, yFilter3);                                \
1243   row21 = vis_faligndata(data1, data2);                                 \
1244   u31 = vis_fmul8ulx16(row33, yFilter3);                                \
1245   row22 = vis_faligndata(data2, data3);                                 \
1246   row23 = vis_faligndata(data3, data4);                                 \
1247   sPtr += srcYStride;                                                   \
1248   filterposx &= FILTER_MASK;                                            \
1249   v31 = vis_fpadd16(u10, u11);                                          \
1250   dpSrc = vis_alignaddr(sPtr, 0);                                       \
1251   data0 = dpSrc[0];                                                     \
1252   sum2 = vis_fpadd16(sum2, v22);                                        \
1253   sum3 = vis_fpadd16(sum3, v23);                                        \
1254   data1 = dpSrc[1];                                                     \
1255   v32 = vis_fpadd16(u20, u21);                                          \
1256   data2 = dpSrc[2];                                                     \
1257   sum0 = vis_fpadd16(sum0, v30);                                        \
1258   data3 = dpSrc[3];                                                     \
1259   v33 = vis_fpadd16(u30, u31);                                          \
1260   data4 = dpSrc[4];                                                     \
1261   row30 = vis_faligndata(data0, data1);                                 \
1262   v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
1263   row31 = vis_faligndata(data1, data2);                                 \
1264   row32 = vis_faligndata(data2, data3);                                 \
1265   row33 = vis_faligndata(data3, data4);                                 \
1266   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
1267   sum1 = vis_fpadd16(sum1, v31);                                        \
1268   yFilter0 = yPtr[0];                                                   \
1269   sum2 = vis_fpadd16(sum2, v32);                                        \
1270   v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
1271   yFilter1 = yPtr[1];                                                   \
1272   v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
1273   sum3 = vis_fpadd16(sum3, v33);                                        \
1274   yFilter2 = yPtr[2];                                                   \
1275   v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
1276   d0 = vis_fpadd16(v00, v01);                                           \
1277   yFilter3 = yPtr[3];                                                   \
1278   xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
1279   v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
1280   xFilter0 = xPtr[0];                                                   \
1281   v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
1282   d1 = vis_fpadd16(v10, v11);                                           \
1283   xFilter1 = xPtr[1];                                                   \
1284   v30 = vis_fmul8sux16(sum3, xFilter3);                                 \
1285   v31 = vis_fmul8ulx16(sum3, xFilter3);                                 \
1286   d2 = vis_fpadd16(v20, v21);                                           \
1287   xFilter2 = xPtr[2];                                                   \
1288   d3 = vis_fpadd16(v30, v31);                                           \
1289   xFilter3 = xPtr[3];                                                   \
1290   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1291 
1292 /***************************************************************/
/*
 * FADD_4BC_S16
 *
 * Final reduction stage of the pipelined loop: adds the four partial
 * results d0..d3 (left behind by BC_S16_4CH) into d0, passes the high and
 * low halves of d0 through vis_fmuld8sux16 with f_x01000100 and repacks
 * them into `res` with vis_fpackfix_pair. These six statements are the
 * same as the last six statements of RESULT_4BC_S16_1PIXEL.
 *
 * Reads:  d0..d3, f_x01000100.  Writes: d0, d2, d3, res.
 * Must be expanded before the next BC_S16_4CH, which overwrites d0..d3.
 * No trailing ';' and no do { } while (0) wrapper.
 */
1293 #define FADD_4BC_S16()                                          \
1294   d0 = vis_fpadd16(d0, d1);                                     \
1295   d2 = vis_fpadd16(d2, d3);                                     \
1296   d0 = vis_fpadd16(d0, d2);                                     \
1297   d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1298   d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1299   res = vis_fpackfix_pair(d2, d3)
1300 
1301 /***************************************************************/
/*
 * mlib_ImageAffine_s16_4ch_bc
 *
 * Bicubic affine resampling of a 4-channel mlib_s16 image, one destination
 * line at a time, using VIS intrinsics.
 *
 * param   - affine parameter block. It is consumed by DECLAREVAR_BC(),
 *           which is defined outside this section; NOTE(review):
 *           presumably that macro declares and initialises filter,
 *           srcYStride, lineAddr, dstData, yStart, yFinish, X, Y, dX, dY,
 *           xLeft, xRight, j, xSrc, ySrc, srcPixelPtr and dstPixelPtr -
 *           confirm against its definition.
 * returns - MLIB_SUCCESS after the line loop has finished.
 *
 * The coefficient table is mlib_filters_s16_bc_4 when filter equals
 * MLIB_BICUBIC and mlib_filters_s16_bc2_4 otherwise.
 *
 * Lines with at least four pixels run through a software pipeline built
 * from BC_S16_4CH / FADD_4BC_S16 (prologue, steady-state loop, four-pixel
 * epilogue); shorter lines are handled one pixel at a time.
 *
 * NOTE(review): the line that carries the signature below also contains a
 * second copy of the signature in front of its line number. This looks
 * like residue from the tool that produced this listing, not source text.
 */
mlib_ImageAffine_s16_4ch_bc(mlib_affine_param * param)1302 mlib_status mlib_ImageAffine_s16_4ch_bc (mlib_affine_param *param)
1303 {
1304   DECLAREVAR_BC();
1305   DTYPE  *dstLineEnd;
1306   mlib_s32  filterposx, filterposy;
1307   mlib_d64  data0, data1, data2, data3, data4;
1308   mlib_d64  sum0, sum1, sum2, sum3;
1309   mlib_d64  row00, row10, row20, row30;
1310   mlib_d64  row01, row11, row21, row31;
1311   mlib_d64  row02, row12, row22, row32;
1312   mlib_d64  row03, row13, row23, row33;
1313   mlib_d64  xFilter0, xFilter1, xFilter2, xFilter3;
1314   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
1315   mlib_d64  v00, v01, v02, v03, v10, v11, v12, v13;
1316   mlib_d64  v20, v21, v22, v23, v30, v31, v32, v33;
1317   mlib_d64  u00, u01, u10, u11, u20, u21, u30, u31;
1318   mlib_d64  d0, d1, d2, d3;
1319   mlib_d64 *yPtr, *xPtr;
1320   mlib_d64 *dp, *dpSrc;
1321   mlib_s32  cols, i, mask, gsrd;
1322   mlib_d64  res;
       /* Constant operand used by FADD_4BC_S16 / RESULT_4BC_S16_1PIXEL. */
1323   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
1324   const mlib_s16 *mlib_filters_table_4;
1325 
       /* Select the coefficient table for the requested filter type. */
1326   if (filter == MLIB_BICUBIC) {
1327     mlib_filters_table_4 = mlib_filters_s16_bc_4;
1328   } else {
1329     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
1330   }
1331 
       /*
        * sPtr is an mlib_s16 pointer and is advanced by srcYStride in the
        * macros; NOTE(review): presumably the stride arrives in bytes and is
        * halved here to count mlib_s16 elements - confirm.
        */
1332   srcYStride >>= 1;
1333 
1334   for (j = yStart; j <= yFinish; j++) {
1335 
         /*
          * Written once per line. NOTE(review): presumably this sets the
          * GSR scale factor used by vis_fpackfix_pair - confirm against the
          * VIS documentation.
          */
1336     vis_write_gsr(10 << 3);
1337 
         /*
          * CLIP is defined outside this section. NOTE(review): presumably it
          * sets xLeft, xRight, X, Y and dstPixelPtr for line j and skips
          * empty lines - confirm.
          */
1338     CLIP(4);
1339     dstLineEnd  = (DTYPE*)dstData + 4 * xRight;
1340 
1341     cols = xRight - xLeft + 1;
         /* dp: destination pointer as returned by vis_alignaddr. */
1342     dp = vis_alignaddr(dstPixelPtr, 0);
         /* Move from the first to the fourth channel of the last pixel. */
1343     dstLineEnd += 3;
         /*
          * Each pixel is stored as two masked 8-byte stores (mask / ~mask).
          * The same mask is reused for every pixel of the line.
          */
1344     mask = vis_edge16(dstPixelPtr, dstLineEnd);
         /* Byte offset later handed to vis_alignaddr to rotate res. */
1345     gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1346 
1347     i = 0;
1348 
         /* Pipelined path: needs at least four pixels on the line. */
1349     if (i <= cols - 4) {
1350 
           /* Prologue: load rows and coefficients for the first pixel. */
1351       NEXT_PIXEL_4BC_S16();
1352       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1353 
1354       NEXT_PIXEL_4BC_S16();
1355 
           /*
            * First stage leaves d0..d3 for the first pixel and loads the
            * next block; FADD turns d0..d3 into res; the second stage then
            * refills d0..d3 for the following pixel.
            */
1356       BC_S16_4CH(mlib_filters_table_4);
1357       FADD_4BC_S16();
1358 
1359       BC_S16_4CH(mlib_filters_table_4);
1360 
           /*
            * Steady state: store the finished res, finish the next pixel
            * with FADD, start another one with BC_S16_4CH.
            */
1361 #pragma pipeloop(0)
1362       for (; i < cols-4; i++) {
             /*
              * BC_S16_4CH calls vis_alignaddr on the source pointer, so the
              * alignment offset for the destination is set again here.
              */
1363         vis_alignaddr((void *)gsrd, 0);
1364         res = vis_faligndata(res, res);
1365 
1366         vis_pst_16(res, dp++, mask);
1367         vis_pst_16(res, dp, ~mask);
1368 
1369         FADD_4BC_S16();
1370         BC_S16_4CH(mlib_filters_table_4);
1371       }
1372 
           /* Epilogue, pixel 1 of 4: res is already complete. */
1373       vis_alignaddr((void *)gsrd, 0);
1374       res = vis_faligndata(res, res);
1375       vis_pst_16(res, dp++, mask);
1376       vis_pst_16(res, dp, ~mask);
1377 
           /* Epilogue, pixel 2 of 4: d0..d3 are pending from BC_S16_4CH. */
1378       FADD_4BC_S16();
1379       vis_alignaddr((void *)gsrd, 0);
1380       res = vis_faligndata(res, res);
1381       vis_pst_16(res, dp++, mask);
1382       vis_pst_16(res, dp, ~mask);
1383 
           /*
            * Epilogue, pixel 3 of 4: rows and coefficients were loaded by the
            * last BC_S16_4CH; compute the whole result without loading more.
            */
1384       RESULT_4BC_S16_1PIXEL();
1385       vis_alignaddr((void *)gsrd, 0);
1386       res = vis_faligndata(res, res);
1387       vis_pst_16(res, dp++, mask);
1388       vis_pst_16(res, dp, ~mask);
1389 
           /*
            * Epilogue, pixel 4 of 4: load through the sPtr left by the last
            * BC_S16_4CH, then compute and store.
            */
1390       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1391       RESULT_4BC_S16_1PIXEL();
1392       vis_alignaddr((void *)gsrd, 0);
1393       res = vis_faligndata(res, res);
1394       vis_pst_16(res, dp++, mask);
1395       vis_pst_16(res, dp, ~mask);
           /* Account for the four epilogue pixels; i now equals cols. */
1396       i += 4;
1397     }
1398 
         /*
          * Non-pipelined path: only does work when the line has fewer than
          * four pixels (after the pipelined path i already equals cols).
          */
1399 #pragma pipeloop(0)
1400     for (; i < cols; i++) {
1401       NEXT_PIXEL_4BC_S16();
1402       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1403       RESULT_4BC_S16_1PIXEL();
1404       vis_alignaddr((void *)gsrd, 0);
1405       res = vis_faligndata(res, res);
1406       vis_pst_16(res, dp++, mask);
1407       vis_pst_16(res, dp, ~mask);
1408     }
1409   }
1410 
1411   return MLIB_SUCCESS;
1412 }
1413 
1414 /***************************************************************/
1415