1 /*
2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 
27 
28 /*
29  *      The functions step along the lines from xLeft to xRight and apply
30  *      the bicubic filtering.
31  *
32  */
33 
34 #include "vis_proto.h"
35 #include "mlib_ImageAffine.h"
36 #include "mlib_v_ImageFilters.h"
37 
38 /***************************************************************/
/* Element type of the destination pointers declared and cast in this file. */
39 #define DTYPE  mlib_s16
40 
/*
 * NOTE(review): FILTER_SHIFT and FILTER_MASK, which the macros below use to
 * turn X / Y into filter-table offsets, are not defined in this file.
 * Presumably an included header derives them from FILTER_BITS - confirm.
 */
41 #define FILTER_BITS  9
42 
43 /***************************************************************/
/* Short alias: the macros below are written in terms of sPtr. */
44 #define sPtr srcPixelPtr
45 
46 /***************************************************************/
/*
 * NEXT_PIXEL_1BC_S16
 *
 * Points sPtr (srcPixelPtr) at the first sample of the source window for
 * the current fixed-point position (X, Y): row (Y >> MLIB_SHIFT) - 1 of
 * lineAddr, column (X >> MLIB_SHIFT) - 1, one mlib_s16 per pixel.
 *
 * Writes xSrc, ySrc and sPtr; reads X, Y and lineAddr from the enclosing
 * scope.  The expansion is three statements with no trailing semicolon, so
 * it must be used as a complete statement, never as the unbraced body of
 * an if / for.
 */
47 #define NEXT_PIXEL_1BC_S16()                                    \
48   xSrc = (X >> MLIB_SHIFT)-1;                                   \
49   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
50   sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
51 
52 /***************************************************************/
/*
 * LOAD_BC_S16_1CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)
 *
 * Non-pipelined load stage for one 1-channel pixel.
 *
 *  - Reads 8 bytes starting at sPtr from each of four consecutive source
 *    rows into row0..row3.  Every read goes through vis_alignaddr +
 *    vis_faligndata over two adjacent mlib_d64 words (dpSrc[0] and
 *    dpSrc[1] are both always loaded), so sPtr does not have to be 8-byte
 *    aligned.  sPtr is advanced by srcYStride three times and is left on
 *    the fourth row.
 *  - Loads the vertical coefficients yFilter0..yFilter3 as four
 *    consecutive mlib_d64 values from mlib_filters_s16_4 at byte offset
 *    filterposy * 4, and the horizontal coefficients xFilter as a single
 *    mlib_d64 from mlib_filters_s16 at byte offset filterposx.
 *  - Advances X and Y by dX and dY.
 *
 * Parameters:
 *   mlib_filters_s16   - table the horizontal coefficients are read from
 *   mlib_filters_s16_4 - table the vertical coefficients are read from
 *
 * NOTE(review): filterposx is used directly as a byte offset into a table
 * of mlib_d64 entries, so FILTER_MASK presumably keeps it a multiple of 8 -
 * confirm where FILTER_SHIFT / FILTER_MASK are defined.
 */
53 #define LOAD_BC_S16_1CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)    \
54   dpSrc = vis_alignaddr(sPtr, 0);                                       \
55   data0 = dpSrc[0];                                                     \
56   data1 = dpSrc[1];                                                     \
57   row0 = vis_faligndata(data0, data1);                                  \
58   sPtr += srcYStride;                                                   \
59   dpSrc = vis_alignaddr(sPtr, 0);                                       \
60   data0 = dpSrc[0];                                                     \
61   data1 = dpSrc[1];                                                     \
62   row1 = vis_faligndata(data0, data1);                                  \
63   sPtr += srcYStride;                                                   \
64   dpSrc = vis_alignaddr(sPtr, 0);                                       \
65   data0 = dpSrc[0];                                                     \
66   data1 = dpSrc[1];                                                     \
67   row2 = vis_faligndata(data0, data1);                                  \
68   sPtr += srcYStride;                                                   \
69   dpSrc = vis_alignaddr(sPtr, 0);                                       \
70   data0 = dpSrc[0];                                                     \
71   data1 = dpSrc[1];                                                     \
72   row3 = vis_faligndata(data0, data1);                                  \
73   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
74   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
75   yFilter0 = yPtr[0];                                                   \
76   yFilter1 = yPtr[1];                                                   \
77   yFilter2 = yPtr[2];                                                   \
78   yFilter3 = yPtr[3];                                                   \
79   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
80   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
81   X += dX;                                                              \
82   Y += dY
83 
84 /***************************************************************/
/*
 * RESULT_1BC_S16_1PIXEL
 *
 * Arithmetic stage for a single 1-channel pixel.  Consumes row0..row3,
 * yFilter0..yFilter3 and xFilter as left by a load stage and produces res.
 *
 *  1. Vertical pass: each row is XORed with mask8000 and multiplied
 *     lane-wise by its yFilter.  Every multiply is the vis_fmul8sux16 /
 *     vis_fmul8ulx16 pair summed with vis_fpadd16 (the usual VIS pairing
 *     for a 16x16 multiply).  The four row products are accumulated in sum.
 *  2. Horizontal pass: sum is multiplied lane-wise by xFilter in the same
 *     way, then the four lanes are reduced to one value: vis_fpadd16s folds
 *     the high and low halves, vis_fmuld8sux16 with f_x01000100 widens the
 *     remaining two lanes to 32 bits, and vis_fpadd32s adds them.
 *  3. The 32-bit value is written to the low half of d1, packed with
 *     vis_fpackfix_pair and XORed with mask8000 again.
 *
 * Clobbers u0..u3, v0..v3, sum, d00, d10, d0, d1 and p0.
 *
 * NOTE(review): only the low half of d1 is written here; the high half
 * keeps whatever d1 held before.  This looks harmless because the callers
 * in this file store res with vis_st_u16, which presumably stores only the
 * lowest 16 bits - confirm.
 */
85 #define RESULT_1BC_S16_1PIXEL()                                          \
86   u0 = vis_fmul8sux16(vis_fxor(row0, mask8000), yFilter0);               \
87   u1 = vis_fmul8ulx16(vis_fxor(row0, mask8000), yFilter0);               \
88   u2 = vis_fmul8sux16(vis_fxor(row1, mask8000), yFilter1);               \
89   v0 = vis_fpadd16(u0, u1);                                              \
90   u3 = vis_fmul8ulx16(vis_fxor(row1, mask8000), yFilter1);               \
91   u0 = vis_fmul8sux16(vis_fxor(row2, mask8000), yFilter2);               \
92   v1 = vis_fpadd16(u2, u3);                                              \
93   u1 = vis_fmul8ulx16(vis_fxor(row2, mask8000), yFilter2);               \
94   sum = vis_fpadd16(v0, v1);                                             \
95   u2 = vis_fmul8sux16(vis_fxor(row3, mask8000), yFilter3);               \
96   v2 = vis_fpadd16(u0, u1);                                              \
97   u3 = vis_fmul8ulx16(vis_fxor(row3, mask8000), yFilter3);               \
98   sum = vis_fpadd16(sum, v2);                                            \
99   v3 = vis_fpadd16(u2, u3);                                              \
100   sum = vis_fpadd16(sum, v3);                                            \
101   d00 = vis_fmul8sux16(sum, xFilter);                                    \
102   d10 = vis_fmul8ulx16(sum, xFilter);                                    \
103   d0 = vis_fpadd16(d00, d10);                                            \
104   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));                   \
105   d0 = vis_fmuld8sux16(f_x01000100, p0);                                 \
106   d1 = vis_write_lo(d1, vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0))); \
107   res = vis_fxor(vis_fpackfix_pair(d1, d1), mask8000)
108 
109 /***************************************************************/
/*
 * BC_S16_1CH(ind, mlib_filters_s16, mlib_filters_s16_4)
 *
 * One step of the software-pipelined 1-channel kernel.  Statements that
 * belong to two different pixels are deliberately interleaved, and each
 * rowN / filter register is reloaded only after its last use for the
 * previous pixel - do not reorder.
 *
 * On entry:
 *   row0..row3, yFilter0..3, xFilter  hold the data of pixel k
 *   sPtr                              points at the source window of pixel k+1
 *   X, Y                              are the coordinates of pixel k+1
 *
 * The step
 *   - finishes the vertical and horizontal multiplies for pixel k and
 *     stores the four-lane horizontal product in d0<ind> (token pasting,
 *     i.e. d00..d03); the reduction to a single sample is left to
 *     FADD_1BC_S16;
 *   - reloads row0..row3 from sPtr and the filter coefficients from the
 *     current X / Y, i.e. for pixel k+1;
 *   - advances X, Y by dX, dY and re-points sPtr at the window of pixel
 *     k+2 through lineAddr[ySrc] + xSrc.
 *
 * Parameters:
 *   ind                - digit pasted onto d0 to select the output variable
 *   mlib_filters_s16   - table the horizontal coefficients are read from
 *   mlib_filters_s16_4 - table the vertical coefficients are read from
 *
 * Because the last statement indexes lineAddr for the pixel two positions
 * ahead, callers must only use this macro while at least that many pixels
 * remain in the row.
 */
110 #define BC_S16_1CH(ind, mlib_filters_s16, mlib_filters_s16_4)           \
111   u0 = vis_fmul8sux16(vis_fxor(row0, mask8000), yFilter0);              \
112   u1 = vis_fmul8ulx16(vis_fxor(row0, mask8000), yFilter0);              \
113   dpSrc = vis_alignaddr(sPtr, 0);                                       \
114   u2 = vis_fmul8sux16(vis_fxor(row1, mask8000), yFilter1);              \
115   v0 = vis_fpadd16(u0, u1);                                             \
116   data0 = dpSrc[0];                                                     \
117   filterposy = (Y >> FILTER_SHIFT);                                     \
118   u3 = vis_fmul8ulx16(vis_fxor(row1, mask8000), yFilter1);              \
119   data1 = dpSrc[1];                                                     \
120   row0 = vis_faligndata(data0, data1);                                  \
121   filterposx = (X >> FILTER_SHIFT);                                     \
122   sPtr += srcYStride;                                                   \
123   dpSrc = vis_alignaddr(sPtr, 0);                                       \
124   u0 = vis_fmul8sux16(vis_fxor(row2, mask8000), yFilter2);              \
125   v1 = vis_fpadd16(u2, u3);                                             \
126   data0 = dpSrc[0];                                                     \
127   u1 = vis_fmul8ulx16(vis_fxor(row2, mask8000), yFilter2);              \
128   sum = vis_fpadd16(v0, v1);                                            \
129   X += dX;                                                              \
130   data1 = dpSrc[1];                                                     \
131   row1 = vis_faligndata(data0, data1);                                  \
132   sPtr += srcYStride;                                                   \
133   dpSrc = vis_alignaddr(sPtr, 0);                                       \
134   u2 = vis_fmul8sux16(vis_fxor(row3, mask8000), yFilter3);              \
135   v2 = vis_fpadd16(u0, u1);                                             \
136   Y += dY;                                                              \
137   xSrc = (X >> MLIB_SHIFT)-1;                                           \
138   data0 = dpSrc[0];                                                     \
139   u3 = vis_fmul8ulx16(vis_fxor(row3, mask8000), yFilter3);              \
140   sum = vis_fpadd16(sum, v2);                                           \
141   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
142   data1 = dpSrc[1];                                                     \
143   filterposy &= FILTER_MASK;                                            \
144   row2 = vis_faligndata(data0, data1);                                  \
145   sPtr += srcYStride;                                                   \
146   filterposx &= FILTER_MASK;                                            \
147   dpSrc = vis_alignaddr(sPtr, 0);                                       \
148   data0 = dpSrc[0];                                                     \
149   v3 = vis_fpadd16(u2, u3);                                             \
150   data1 = dpSrc[1];                                                     \
151   row3 = vis_faligndata(data0, data1);                                  \
152   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
153   yFilter0 = yPtr[0];                                                   \
154   sum = vis_fpadd16(sum, v3);                                           \
155   yFilter1 = yPtr[1];                                                   \
156   d0 = vis_fmul8sux16(sum, xFilter);                                    \
157   yFilter2 = yPtr[2];                                                   \
158   d1 = vis_fmul8ulx16(sum, xFilter);                                    \
159   yFilter3 = yPtr[3];                                                   \
160   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
161   d0##ind = vis_fpadd16(d0, d1);                                        \
162   sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
163 
164 /***************************************************************/
/*
 * FADD_1BC_S16
 *
 * Final reduction for four pipelined 1-channel pixels.  Takes the four-lane
 * horizontal products d00, d01, d02, d03 written by BC_S16_1CH and turns
 * each into a single value:
 *   - vis_fpadd16s folds the high and low halves (4 lanes -> 2 lanes),
 *   - vis_fmuld8sux16 with f_x01000100 widens the two lanes to 32 bits,
 *   - vis_fpadd32s adds them.
 * The four 32-bit results are paired as (d00, d01) and (d02, d03), packed
 * with vis_fpackfix_pair and XORed with mask8000, leaving four output
 * samples in res.
 *
 * Clobbers p0..p3 and d0..d3.
 */
165 #define FADD_1BC_S16()                                                \
166   p0 = vis_fpadd16s(vis_read_hi(d00), vis_read_lo(d00));              \
167   p1 = vis_fpadd16s(vis_read_hi(d01), vis_read_lo(d01));              \
168   p2 = vis_fpadd16s(vis_read_hi(d02), vis_read_lo(d02));              \
169   p3 = vis_fpadd16s(vis_read_hi(d03), vis_read_lo(d03));              \
170   d0 = vis_fmuld8sux16(f_x01000100, p0);                              \
171   d1 = vis_fmuld8sux16(f_x01000100, p1);                              \
172   d2 = vis_fmuld8sux16(f_x01000100, p2);                              \
173   d3 = vis_fmuld8sux16(f_x01000100, p3);                              \
174   d0 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0)),  \
175                      vis_fpadd32s(vis_read_hi(d1), vis_read_lo(d1))); \
176   d1 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d2), vis_read_lo(d2)),  \
177                      vis_fpadd32s(vis_read_hi(d3), vis_read_lo(d3))); \
178   res = vis_fxor(vis_fpackfix_pair(d0, d1), mask8000)
179 
180 /***************************************************************/
mlib_ImageAffine_u16_1ch_bc(mlib_affine_param * param)181 mlib_status mlib_ImageAffine_u16_1ch_bc (mlib_affine_param *param)
182 {
183   DECLAREVAR_BC();
184   mlib_s32  filterposx, filterposy;
185   mlib_d64  data0, data1;
186   mlib_d64  sum;
187   mlib_d64  row0, row1, row2, row3;
188   mlib_f32  p0, p1, p2, p3;
189   mlib_d64  xFilter, yFilter0, yFilter1, yFilter2, yFilter3;
190   mlib_d64  v0, v1, v2, v3;
191   mlib_d64  u0, u1, u2, u3;
192   mlib_d64  d0, d1, d2, d3;
193   mlib_d64  d00, d10, d01, d02, d03;
194   mlib_d64 *yPtr;
195   mlib_d64 *dpSrc;
196   mlib_s32  align, cols, i;
197   mlib_d64  res;
198   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
199   mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
200   const mlib_s16 *mlib_filters_table  ;
201   const mlib_s16 *mlib_filters_table_4;
202 
203   if (filter == MLIB_BICUBIC) {
204     mlib_filters_table   = mlib_filters_s16_bc;
205     mlib_filters_table_4 = mlib_filters_s16_bc_4;
206   } else {
207     mlib_filters_table   = mlib_filters_s16_bc2;
208     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
209   }
210 
211   srcYStride >>= 1;
212 
213   for (j = yStart; j <= yFinish; j++) {
214 
215     vis_write_gsr(10 << 3);
216 
217     CLIP(1);
218 
219     cols = xRight - xLeft + 1;
220     align = (8 - ((mlib_addr)dstPixelPtr) & 7) & 7;
221     align >>= 1;
222     align = (cols < align)? cols : align;
223 
224     for (i = 0; i < align; i++) {
225       NEXT_PIXEL_1BC_S16();
226       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
227       RESULT_1BC_S16_1PIXEL();
228       vis_st_u16(res, dstPixelPtr++);
229     }
230 
231     if (i <= cols - 10) {
232 
233       NEXT_PIXEL_1BC_S16();
234       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
235 
236       NEXT_PIXEL_1BC_S16();
237 
238       BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
239       BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
240       BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
241       BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
242 
243       FADD_1BC_S16();
244 
245       BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
246       BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
247       BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
248       BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
249 
250 #pragma pipeloop(0)
251       for (; i <= cols - 14; i += 4) {
252         *(mlib_d64*)dstPixelPtr = res;
253         FADD_1BC_S16();
254         BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
255         BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
256         BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
257         BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
258         dstPixelPtr += 4;
259       }
260 
261       *(mlib_d64*)dstPixelPtr = res;
262       dstPixelPtr += 4;
263       FADD_1BC_S16();
264       *(mlib_d64*)dstPixelPtr = res;
265       dstPixelPtr += 4;
266 
267       RESULT_1BC_S16_1PIXEL();
268       vis_st_u16(res, dstPixelPtr++);
269 
270       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
271       RESULT_1BC_S16_1PIXEL();
272       vis_st_u16(res, dstPixelPtr++);
273       i += 10;
274     }
275 
276     for (; i < cols; i++) {
277       NEXT_PIXEL_1BC_S16();
278       LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
279       RESULT_1BC_S16_1PIXEL();
280       vis_st_u16(res, dstPixelPtr++);
281     }
282   }
283 
284   return MLIB_SUCCESS;
285 }
286 
287 /***************************************************************/
/*
 * NEXT_PIXEL_2BC_S16
 *
 * Two-channel variant of the source-pointer setup: points sPtr
 * (srcPixelPtr) at row (Y >> MLIB_SHIFT) - 1 of lineAddr, pixel
 * (X >> MLIB_SHIFT) - 1, where each pixel occupies two mlib_s16 elements
 * (hence xSrc << 1).
 *
 * Writes xSrc, ySrc and sPtr; reads X, Y and lineAddr from the enclosing
 * scope.  Expands to three statements with no trailing semicolon.
 */
288 #define NEXT_PIXEL_2BC_S16()                                    \
289   xSrc = (X >> MLIB_SHIFT)-1;                                   \
290   ySrc = (Y >> MLIB_SHIFT)-1;                                   \
291   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
292 
293 /***************************************************************/
/*
 * LOAD_BC_S16_2CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)
 *
 * Non-pipelined load stage for one 2-channel pixel.
 *
 *  - Reads 16 bytes starting at sPtr from each of four consecutive source
 *    rows.  Three adjacent mlib_d64 words are loaded per row and combined
 *    with vis_faligndata into two values, rowN0 (first 8 bytes) and rowN1
 *    (next 8 bytes), so sPtr does not have to be 8-byte aligned.  sPtr is
 *    advanced by srcYStride three times and is left on the fourth row.
 *  - Loads the vertical coefficients yFilter0..yFilter3 as four
 *    consecutive mlib_d64 values from mlib_filters_s16_4 at byte offset
 *    filterposy * 4, and the horizontal coefficients xFilter as a single
 *    mlib_d64 from mlib_filters_s16 at byte offset filterposx.
 *  - Advances X and Y by dX and dY.
 *
 * Parameters:
 *   mlib_filters_s16   - table the horizontal coefficients are read from
 *   mlib_filters_s16_4 - table the vertical coefficients are read from
 *
 * NOTE(review): FILTER_SHIFT / FILTER_MASK are defined outside this file;
 * filterposx is used directly as a byte offset, so the mask presumably
 * keeps it a multiple of 8 - confirm.
 */
294 #define LOAD_BC_S16_2CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)    \
295   dpSrc = vis_alignaddr(sPtr, 0);                                       \
296   data0 = dpSrc[0];                                                     \
297   data1 = dpSrc[1];                                                     \
298   data2 = dpSrc[2];                                                     \
299   row00 = vis_faligndata(data0, data1);                                 \
300   row01 = vis_faligndata(data1, data2);                                 \
301   sPtr += srcYStride;                                                   \
302   dpSrc = vis_alignaddr(sPtr, 0);                                       \
303   data0 = dpSrc[0];                                                     \
304   data1 = dpSrc[1];                                                     \
305   data2 = dpSrc[2];                                                     \
306   row10 = vis_faligndata(data0, data1);                                 \
307   row11 = vis_faligndata(data1, data2);                                 \
308   sPtr += srcYStride;                                                   \
309   dpSrc = vis_alignaddr(sPtr, 0);                                       \
310   data0 = dpSrc[0];                                                     \
311   data1 = dpSrc[1];                                                     \
312   data2 = dpSrc[2];                                                     \
313   row20 = vis_faligndata(data0, data1);                                 \
314   row21 = vis_faligndata(data1, data2);                                 \
315   sPtr += srcYStride;                                                   \
316   dpSrc = vis_alignaddr(sPtr, 0);                                       \
317   data0 = dpSrc[0];                                                     \
318   data1 = dpSrc[1];                                                     \
319   data2 = dpSrc[2];                                                     \
320   row30 = vis_faligndata(data0, data1);                                 \
321   row31 = vis_faligndata(data1, data2);                                 \
322   filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
323   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
324   yFilter0 = yPtr[0];                                                   \
325   yFilter1 = yPtr[1];                                                   \
326   yFilter2 = yPtr[2];                                                   \
327   yFilter3 = yPtr[3];                                                   \
328   filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
329   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
330   X += dX;                                                              \
331   Y += dY
332 
333 /***************************************************************/
/*
 * RESULT_2BC_S16_1PIXEL
 *
 * Arithmetic stage for a single 2-channel pixel.  Consumes rowN0 / rowN1
 * (N = 0..3), yFilter0..yFilter3 and xFilter as left by a load stage and
 * produces res.
 *
 *  1. Coefficient shuffle: the vis_fpmerge sequence (a byte interleave)
 *     rearranges the four 16-bit horizontal coefficients of xFilter so that
 *     xFilter0 holds coefficients 0,0,1,1 and xFilter1 holds 2,2,3,3,
 *     i.e. each coefficient duplicated for the two interleaved channels.
 *     These statements are interleaved with the multiplies below.
 *  2. Vertical pass: every rowN0 / rowN1 is XORed with mask8000 and
 *     multiplied lane-wise by yFilterN (vis_fmul8sux16 + vis_fmul8ulx16
 *     summed with vis_fpadd16).  The rowN0 products accumulate in sum0, the
 *     rowN1 products in sum1.
 *  3. Horizontal pass: sum0 * xFilter0 and sum1 * xFilter1 are added
 *     together, vis_fpadd16s folds the high and low halves, and
 *     vis_fmuld8sux16 with f_x01000100 widens the two remaining lanes (one
 *     per channel) to 32 bits.
 *  4. vis_fpackfix_pair(d0, d0) packs the pair twice, so the same two
 *     channel values appear in both halves of res; the result is XORed
 *     with mask8000.
 *
 * Clobbers dr, dr1, xFilter0, xFilter1, u00..u21, v00..v31, sum0, sum1,
 * d00, d10, d20, d30, d0, d1 and p0.
 */
334 #define RESULT_2BC_S16_1PIXEL()                                 \
335   u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);    \
336   dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
337   u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);    \
338   dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));           \
339   u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);    \
340   dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));          \
341   u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);    \
342   dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));           \
343   u20 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);    \
344   v00 = vis_fpadd16(u00, u01);                                  \
345   u21 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);    \
346   v01 = vis_fpadd16(u10, u11);                                  \
347   u00 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);    \
348   xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));    \
349   u01 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);    \
350   u10 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);    \
351   u11 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);    \
352   v10 = vis_fpadd16(u20, u21);                                  \
353   sum0 = vis_fpadd16(v00, v10);                                 \
354   u20 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);    \
355   v11 = vis_fpadd16(u00, u01);                                  \
356   u21 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);    \
357   xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));    \
358   u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);    \
359   v20 = vis_fpadd16(u10, u11);                                  \
360   sum1 = vis_fpadd16(v01, v11);                                 \
361   u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);    \
362   sum0 = vis_fpadd16(sum0, v20);                                \
363   v21 = vis_fpadd16(u20, u21);                                  \
364   u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);    \
365   v30 = vis_fpadd16(u00, u01);                                  \
366   sum1 = vis_fpadd16(sum1, v21);                                \
367   u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);    \
368   sum0 = vis_fpadd16(sum0, v30);                                \
369   v31 = vis_fpadd16(u10, u11);                                  \
370   sum1 = vis_fpadd16(sum1, v31);                                \
371   d00 = vis_fmul8sux16(sum0, xFilter0);                         \
372   d10 = vis_fmul8ulx16(sum0, xFilter0);                         \
373   d20 = vis_fmul8sux16(sum1, xFilter1);                         \
374   d30 = vis_fmul8ulx16(sum1, xFilter1);                         \
375   d0 = vis_fpadd16(d00, d10);                                   \
376   d1 = vis_fpadd16(d20, d30);                                   \
377   d0 = vis_fpadd16(d0, d1);                                     \
378   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
379   d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
380   res = vis_fxor(vis_fpackfix_pair(d0, d0), mask8000)
381 
382 /***************************************************************/
/*
 * BC_S16_2CH(ind, mlib_filters_s16, mlib_filters_s16_4)
 *
 * One step of the software-pipelined 2-channel kernel.  Statements that
 * belong to two different pixels are deliberately interleaved, and each
 * row / filter register is reloaded only after its last use for the
 * previous pixel - do not reorder.
 *
 * On entry:
 *   rowN0 / rowN1, yFilter0..3, xFilter  hold the data of pixel k
 *   sPtr                                 points at the source window of
 *                                        pixel k+1
 *   X, Y                                 are the coordinates of pixel k+1
 *
 * The step
 *   - builds xFilter0 / xFilter1 from xFilter with the vis_fpmerge
 *     sequence, runs the vertical pass into sum0 / sum1 and stores the two
 *     horizontal products in d0<ind> (sum0 * xFilter0) and d1<ind>
 *     (sum1 * xFilter1); adding them and reducing to one sample per
 *     channel is left to FADD_2BC_S16;
 *   - reloads all eight row registers from sPtr and the filter
 *     coefficients from the current X / Y, i.e. for pixel k+1;
 *   - advances X, Y by dX, dY and re-points sPtr at the window of pixel
 *     k+2 through lineAddr[ySrc] + (xSrc << 1).
 *
 * Parameters:
 *   ind                - digit pasted onto d0 / d1 to select the outputs
 *   mlib_filters_s16   - table the horizontal coefficients are read from
 *   mlib_filters_s16_4 - table the vertical coefficients are read from
 *
 * Because the last statement indexes lineAddr for the pixel two positions
 * ahead, callers must only use this macro while at least that many pixels
 * remain in the row.
 */
383 #define BC_S16_2CH(ind, mlib_filters_s16, mlib_filters_s16_4)           \
384   u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);            \
385   dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));         \
386   u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);            \
387   dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));                   \
388   u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);            \
389   dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));                  \
390   u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);            \
391   dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));                   \
392   dpSrc = vis_alignaddr(sPtr, 0);                                       \
393   u20 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);            \
394   v00 = vis_fpadd16(u00, u01);                                          \
395   u21 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);            \
396   data0 = dpSrc[0];                                                     \
397   filterposy = (Y >> FILTER_SHIFT);                                     \
398   v01 = vis_fpadd16(u10, u11);                                          \
399   data1 = dpSrc[1];                                                     \
400   u00 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);            \
401   xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));            \
402   data2 = dpSrc[2];                                                     \
403   u01 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);            \
404   row00 = vis_faligndata(data0, data1);                                 \
405   u10 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);            \
406   row01 = vis_faligndata(data1, data2);                                 \
407   filterposx = (X >> FILTER_SHIFT);                                     \
408   sPtr += srcYStride;                                                   \
409   dpSrc = vis_alignaddr(sPtr, 0);                                       \
410   u11 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);            \
411   v10 = vis_fpadd16(u20, u21);                                          \
412   data0 = dpSrc[0];                                                     \
413   sum0 = vis_fpadd16(v00, v10);                                         \
414   X += dX;                                                              \
415   data1 = dpSrc[1];                                                     \
416   u20 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);            \
417   v11 = vis_fpadd16(u00, u01);                                          \
418   data2 = dpSrc[2];                                                     \
419   row10 = vis_faligndata(data0, data1);                                 \
420   u21 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);            \
421   row11 = vis_faligndata(data1, data2);                                 \
422   sPtr += srcYStride;                                                   \
423   xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));            \
424   dpSrc = vis_alignaddr(sPtr, 0);                                       \
425   u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);            \
426   v20 = vis_fpadd16(u10, u11);                                          \
427   Y += dY;                                                              \
428   xSrc = (X >> MLIB_SHIFT)-1;                                           \
429   sum1 = vis_fpadd16(v01, v11);                                         \
430   data0 = dpSrc[0];                                                     \
431   u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);            \
432   sum0 = vis_fpadd16(sum0, v20);                                        \
433   ySrc = (Y >> MLIB_SHIFT)-1;                                           \
434   data1 = dpSrc[1];                                                     \
435   v21 = vis_fpadd16(u20, u21);                                          \
436   u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);            \
437   data2 = dpSrc[2];                                                     \
438   v30 = vis_fpadd16(u00, u01);                                          \
439   filterposy &= FILTER_MASK;                                            \
440   row20 = vis_faligndata(data0, data1);                                 \
441   sum1 = vis_fpadd16(sum1, v21);                                        \
442   u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);            \
443   row21 = vis_faligndata(data1, data2);                                 \
444   sPtr += srcYStride;                                                   \
445   filterposx &= FILTER_MASK;                                            \
446   v31 = vis_fpadd16(u10, u11);                                          \
447   dpSrc = vis_alignaddr(sPtr, 0);                                       \
448   data0 = dpSrc[0];                                                     \
449   sum0 = vis_fpadd16(sum0, v30);                                        \
450   data1 = dpSrc[1];                                                     \
451   sum1 = vis_fpadd16(sum1, v31);                                        \
452   data2 = dpSrc[2];                                                     \
453   row30 = vis_faligndata(data0, data1);                                 \
454   d0 = vis_fmul8sux16(sum0, xFilter0);                                  \
455   row31 = vis_faligndata(data1, data2);                                 \
456   yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
457   d1 = vis_fmul8ulx16(sum0, xFilter0);                                  \
458   yFilter0 = yPtr[0];                                                   \
459   d2 = vis_fmul8sux16(sum1, xFilter1);                                  \
460   yFilter1 = yPtr[1];                                                   \
461   d3 = vis_fmul8ulx16(sum1, xFilter1);                                  \
462   d0##ind = vis_fpadd16(d0, d1);                                        \
463   yFilter2 = yPtr[2];                                                   \
464   yFilter3 = yPtr[3];                                                   \
465   d1##ind = vis_fpadd16(d2, d3);                                        \
466   xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
467   sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
468 
469 /***************************************************************/
/*
 * FADD_2BC_S16
 *
 * Final reduction for two pipelined 2-channel pixels.  For the pixel
 * produced with ind 0 it adds d00 and d10, for the pixel produced with
 * ind 1 it adds d01 and d11 (the two horizontal products written by
 * BC_S16_2CH).  Each sum is folded with vis_fpadd16s (high half + low
 * half) and widened to 32 bits with vis_fmuld8sux16 and f_x01000100.  The
 * two results are packed with vis_fpackfix_pair and XORed with mask8000,
 * leaving the first pixel in the high half of res and the second in the
 * low half.
 *
 * Clobbers d0, d1, d2, p0 and p1.
 */
470 #define FADD_2BC_S16()                                          \
471   d0 = vis_fpadd16(d00, d10);                                   \
472   d2 = vis_fpadd16(d01, d11);                                   \
473   p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
474   p1 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
475   d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
476   d1 = vis_fmuld8sux16(f_x01000100, p1);                        \
477   res = vis_fxor(vis_fpackfix_pair(d0, d1), mask8000)
478 
479 /***************************************************************/
mlib_ImageAffine_u16_2ch_bc(mlib_affine_param * param)480 mlib_status mlib_ImageAffine_u16_2ch_bc (mlib_affine_param *param)
481 {
482   DECLAREVAR_BC();
483   DTYPE  *dstLineEnd;
484   mlib_s32  filterposx, filterposy;
485   mlib_d64  data0, data1, data2;
486   mlib_d64  sum0, sum1;
487   mlib_d64  row00, row10, row20, row30;
488   mlib_d64  row01, row11, row21, row31;
489   mlib_f32  p0, p1;
490   mlib_d64  xFilter, xFilter0, xFilter1;
491   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
492   mlib_d64  v00, v01, v10, v11, v20, v21, v30, v31;
493   mlib_d64  u00, u01, u10, u11, u20, u21;
494   mlib_d64  d0, d1, d2, d3;
495   mlib_d64  d00, d10, d20, d30, d01, d11;
496   mlib_d64  *yPtr;
497   mlib_d64  *dp, *dpSrc;
498   mlib_s32  cols, i, mask, emask;
499   mlib_d64  res, res1;
500   mlib_d64  dr, dr1;
501   mlib_f32 f_x01000100 = vis_to_float(0x01000100);
502   mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
503   const mlib_s16 *mlib_filters_table  ;
504   const mlib_s16 *mlib_filters_table_4;
505 
506   if (filter == MLIB_BICUBIC) {
507     mlib_filters_table   = mlib_filters_s16_bc;
508     mlib_filters_table_4 = mlib_filters_s16_bc_4;
509   } else {
510     mlib_filters_table   = mlib_filters_s16_bc2;
511     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
512   }
513 
514   srcYStride >>= 1;
515 
516   for (j = yStart; j <= yFinish; j++) {
517 
518     vis_write_gsr(10 << 3);
519 
520     CLIP(2);
521     dstLineEnd  = (DTYPE*)dstData + 2 * xRight;
522 
523     cols = xRight - xLeft + 1;
524     dp = vis_alignaddr(dstPixelPtr, 0);
525     dstLineEnd += 1;
526     mask = vis_edge16(dstPixelPtr, dstLineEnd);
527     i = 0;
528 
529     if (i <= cols - 6) {
530 
531       NEXT_PIXEL_2BC_S16();
532       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
533 
534       NEXT_PIXEL_2BC_S16();
535 
536       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
537       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
538 
539       FADD_2BC_S16();
540 
541       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
542       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
543 
544 #pragma pipeloop(0)
545       for (; i <= cols-8; i += 2) {
546         vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
547         res = vis_faligndata(res, res);
548         vis_pst_16(res, dp++, mask);
549         vis_pst_16(res, dp, ~mask);
550         FADD_2BC_S16();
551         BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
552         BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
553       }
554 
555       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
556       res = vis_faligndata(res, res);
557       vis_pst_16(res, dp++, mask);
558       vis_pst_16(res, dp, ~mask);
559 
560       FADD_2BC_S16();
561       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
562       res = vis_faligndata(res, res);
563       vis_pst_16(res, dp++, mask);
564       vis_pst_16(res, dp, ~mask);
565 
566       RESULT_2BC_S16_1PIXEL();
567       res1 = res;
568 
569       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
570       RESULT_2BC_S16_1PIXEL();
571       res = vis_write_hi(res, vis_read_hi(res1));
572       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
573       res = vis_faligndata(res, res);
574       vis_pst_16(res, dp++, mask);
575       vis_pst_16(res, dp, ~mask);
576 
577       i += 6;
578     }
579 
580     if (i <= cols - 4) {
581       NEXT_PIXEL_2BC_S16();
582       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
583 
584       NEXT_PIXEL_2BC_S16();
585 
586       BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
587       BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
588 
589       FADD_2BC_S16();
590       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
591       res = vis_faligndata(res, res);
592       vis_pst_16(res, dp++, mask);
593       vis_pst_16(res, dp, ~mask);
594 
595       RESULT_2BC_S16_1PIXEL();
596       res1 = res;
597 
598       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
599       RESULT_2BC_S16_1PIXEL();
600       res = vis_write_hi(res, vis_read_hi(res1));
601       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
602       res = vis_faligndata(res, res);
603       vis_pst_16(res, dp++, mask);
604       vis_pst_16(res, dp, ~mask);
605 
606       i += 4;
607     }
608 
609     if (i <= cols - 2) {
610       NEXT_PIXEL_2BC_S16();
611       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
612       RESULT_2BC_S16_1PIXEL();
613       res1 = res;
614 
615       NEXT_PIXEL_2BC_S16();
616       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
617       RESULT_2BC_S16_1PIXEL();
618       res = vis_write_hi(res, vis_read_hi(res1));
619       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
620       res = vis_faligndata(res, res);
621       vis_pst_16(res, dp++, mask);
622       vis_pst_16(res, dp, ~mask);
623 
624       i += 2;
625     }
626 
627     if (i < cols) {
628       NEXT_PIXEL_2BC_S16();
629       LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
630       RESULT_2BC_S16_1PIXEL();
631       vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
632       res = vis_faligndata(res, res);
633       emask = vis_edge16(dp, dstLineEnd);
634       vis_pst_16(res, dp++, mask & emask);
635 
636       if ((mlib_s16*)dp <= dstLineEnd) {
637         mask = vis_edge16(dp, dstLineEnd);
638         vis_pst_16(res, dp, mask);
639       }
640     }
641   }
642 
643   return MLIB_SUCCESS;
644 }
645 
646 /***************************************************************/
/*
 * Point sPtr at the first sample used for the current 3-channel pixel.
 *
 * The integer parts of the fixed-point coordinates X and Y (shifted right
 * by MLIB_SHIFT) are decremented by one and stored in xSrc and ySrc; sPtr
 * is then set to sample xSrc * 3 of source row lineAddr[ySrc].
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct under an unbraced if/else.
 */
#define NEXT_PIXEL_3BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc * 3);             \
  } while (0)
651 
652 /***************************************************************/
/*
 * Non-pipelined load stage for one 3-channel pixel.
 *
 * Starting at sPtr, four consecutive source rows are read (sPtr is advanced
 * by srcYStride elements between rows, three times in total).  For every
 * row four 8-byte words are fetched from the address returned by
 * vis_alignaddr() and merged pairwise with vis_faligndata() into the three
 * words rowN0, rowN1, rowN2 (N = 0..3).
 *
 * Afterwards the coefficients are fetched:
 *   - yFilter0..3 from mlib_filters_s16_4 at byte offset filterposy * 4,
 *   - xFilter0..2 from mlib_filters_s16_3 at byte offset filterposx * 3,
 * where filterposx/filterposy are (X or Y >> FILTER_SHIFT) & FILTER_MASK.
 * Finally X and Y are advanced by dX and dY.
 *
 * Wrapped in do { } while (0) so the macro is a single statement; the table
 * arguments are parenthesized so that any pointer expression can be passed.
 */
#define LOAD_BC_S16_3CH_1PIXEL(mlib_filters_s16_3, mlib_filters_s16_4)      \
  do {                                                                      \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    row00 = vis_faligndata(data0, data1);                                   \
    row01 = vis_faligndata(data1, data2);                                   \
    row02 = vis_faligndata(data2, data3);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    row10 = vis_faligndata(data0, data1);                                   \
    row11 = vis_faligndata(data1, data2);                                   \
    row12 = vis_faligndata(data2, data3);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    row20 = vis_faligndata(data0, data1);                                   \
    row21 = vis_faligndata(data1, data2);                                   \
    row22 = vis_faligndata(data2, data3);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    row30 = vis_faligndata(data0, data1);                                   \
    row31 = vis_faligndata(data1, data2);                                   \
    row32 = vis_faligndata(data2, data3);                                   \
    filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                         \
    yPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4));  \
    yFilter0 = yPtr[0];                                                     \
    yFilter1 = yPtr[1];                                                     \
    yFilter2 = yPtr[2];                                                     \
    yFilter3 = yPtr[3];                                                     \
    filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                         \
    xPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_3) + filterposx*3));  \
    xFilter0 = xPtr[0];                                                     \
    xFilter1 = xPtr[1];                                                     \
    xFilter2 = xPtr[2];                                                     \
    X += dX;                                                                \
    Y += dY;                                                                \
  } while (0)
702 
703 /***************************************************************/
/*
 * Write one finished 3-channel pixel: copy the first three 16-bit lanes of
 * the result union f0 to dstPixelPtr[0..2] and advance dstPixelPtr by three
 * samples.  The fourth lane f0.t[3] is not stored.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement.
 */
#define STORE_BC_S16_3CH_1PIXEL()                               \
  do {                                                          \
    dstPixelPtr[0] = f0.t[0];                                   \
    dstPixelPtr[1] = f0.t[1];                                   \
    dstPixelPtr[2] = f0.t[2];                                   \
    dstPixelPtr += 3;                                           \
  } while (0)
709 
710 /***************************************************************/
/*
 * Non-pipelined compute stage for one 3-channel pixel.
 *
 * Inputs:  row00..row32 and yFilter0..3 / xFilter0..2 as left by the load
 *          stage, plus the constants mask8000 and f_x01000100.
 * Output:  f0.d (the packed result); u*, v*, sum*, d0..d4 are scratch.
 *
 * Steps, in the order the code performs them:
 *   1. Each row word is xor'ed with mask8000 and multiplied by the yFilter
 *      of its row using the vis_fmul8sux16 / vis_fmul8ulx16 pair, whose two
 *      partial products are added.  The four rows are accumulated into
 *      sum0..sum2.
 *   2. sum0..sum2 are multiplied the same way by xFilter0..2 giving d0..d2.
 *   3. d0..d2 are combined with vis_faligndata() after vis_alignaddr() calls
 *      with addresses 6 and 2 and summed into d0.
 *   4. d0 is expanded with vis_fmuld8sux16(f_x01000100, ...), packed with
 *      vis_fpackfix_pair() and xor'ed with mask8000 again.
 *
 * NOTE(review): mask8000 presumably converts between the unsigned and the
 * signed 16-bit representation - confirm against the U16 kernel design.
 *
 * Unlike the pipelined BC_S16_3CH stage this macro must not touch the row
 * registers: an earlier revision re-derived row30 from data0/data1 here,
 * which was a dead store computed under whatever GSR alignment happened to
 * be current.  It has been removed.  The body is wrapped in
 * do { } while (0) so the macro expands to a single statement.
 */
#define RESULT_3BC_S16_1PIXEL()                                     \
  do {                                                              \
    u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);      \
    u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);      \
    u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);      \
    u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);      \
    v00 = vis_fpadd16(u00, u01);                                    \
    u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);      \
    v01 = vis_fpadd16(u10, u11);                                    \
    u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);      \
    u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);      \
    u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);      \
    v02 = vis_fpadd16(u20, u21);                                    \
    u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);      \
    u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);      \
    v10 = vis_fpadd16(u00, u01);                                    \
    u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);      \
    u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);      \
    u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);      \
    v11 = vis_fpadd16(u10, u11);                                    \
    u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);      \
    v12 = vis_fpadd16(u20, u21);                                    \
    u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);      \
    u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);      \
    v20 = vis_fpadd16(u00, u01);                                    \
    u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);      \
    sum0 = vis_fpadd16(v00, v10);                                   \
    u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);      \
    u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);      \
    u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);      \
    v21 = vis_fpadd16(u10, u11);                                    \
    sum1 = vis_fpadd16(v01, v11);                                   \
    u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);      \
    sum2 = vis_fpadd16(v02, v12);                                   \
    v22 = vis_fpadd16(u20, u21);                                    \
    u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);      \
    sum0 = vis_fpadd16(sum0, v20);                                  \
    u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);      \
    v30 = vis_fpadd16(u00, u01);                                    \
    sum1 = vis_fpadd16(sum1, v21);                                  \
    u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);      \
    v31 = vis_fpadd16(u10, u11);                                    \
    sum2 = vis_fpadd16(sum2, v22);                                  \
    v32 = vis_fpadd16(u20, u21);                                    \
    sum0 = vis_fpadd16(sum0, v30);                                  \
    v00 = vis_fmul8sux16(sum0, xFilter0);                           \
    sum1 = vis_fpadd16(sum1, v31);                                  \
    sum2 = vis_fpadd16(sum2, v32);                                  \
    v01 = vis_fmul8ulx16(sum0, xFilter0);                           \
    v10 = vis_fmul8sux16(sum1, xFilter1);                           \
    v11 = vis_fmul8ulx16(sum1, xFilter1);                           \
    d0 = vis_fpadd16(v00, v01);                                     \
    v20 = vis_fmul8sux16(sum2, xFilter2);                           \
    v21 = vis_fmul8ulx16(sum2, xFilter2);                           \
    d1 = vis_fpadd16(v10, v11);                                     \
    d2 = vis_fpadd16(v20, v21);                                     \
    vis_alignaddr((void*)6, 0);                                     \
    d3 = vis_faligndata(d0, d1);                                    \
    vis_alignaddr((void*)2, 0);                                     \
    d4 = vis_faligndata(d1, d2);                                    \
    d0 = vis_fpadd16(d0, d3);                                       \
    d2 = vis_fpadd16(d2, d4);                                       \
    d1 = vis_faligndata(d2, d2);                                    \
    d0 = vis_fpadd16(d0, d1);                                       \
    d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));             \
    d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));             \
    f0.d = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000);           \
  } while (0)
778 
779 /***************************************************************/
/*
 * Software-pipelined stage for 3-channel pixels.  One expansion interleaves
 * two independent pieces of work; the statement order is deliberate and
 * must be preserved:
 *
 *   (a) COMPUTE for the pixel whose rows/coefficients are currently held in
 *       row00..row32, yFilter0..3 and xFilter0..2: the same vertical and
 *       horizontal multiply/accumulate sequence as RESULT_3BC_S16_1PIXEL,
 *       stopping at d0, d1, d2 (the final reduction is FADD_3BC_S16).
 *
 *   (b) LOAD for the pixel sPtr points to: four rows are read through
 *       vis_alignaddr()/vis_faligndata() into row00..row32, each row
 *       register being overwritten only after (a) has consumed its old
 *       value.  filterposx/filterposy are taken from X and Y *before* they
 *       are advanced, and the new yFilter/xFilter values are fetched only
 *       after the old ones have been used.
 *
 * In between, X and Y are advanced by dX and dY, xSrc/ySrc are recomputed
 * from the advanced coordinates, and the final statement points sPtr at the
 * pixel after the one just loaded.
 *
 *   mlib_filters_s16_3 - table indexed for the horizontal coefficients
 *   mlib_filters_s16_4 - table indexed for the vertical coefficients
 *
 * Wrapped in do { } while (0) so the macro is a single statement; the table
 * arguments are parenthesized so that any pointer expression can be passed.
 */
#define BC_S16_3CH(mlib_filters_s16_3, mlib_filters_s16_4)                  \
  do {                                                                      \
    u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);              \
    u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);              \
    u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);              \
    u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);              \
    v00 = vis_fpadd16(u00, u01);                                            \
    u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);              \
    v01 = vis_fpadd16(u10, u11);                                            \
    u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);              \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);              \
    u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);              \
    data0 = dpSrc[0];                                                       \
    filterposy = (Y >> FILTER_SHIFT);                                       \
    v02 = vis_fpadd16(u20, u21);                                            \
    data1 = dpSrc[1];                                                       \
    u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);              \
    data2 = dpSrc[2];                                                       \
    u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);              \
    v10 = vis_fpadd16(u00, u01);                                            \
    data3 = dpSrc[3];                                                       \
    u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);              \
    row00 = vis_faligndata(data0, data1);                                   \
    u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);              \
    row01 = vis_faligndata(data1, data2);                                   \
    u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);              \
    row02 = vis_faligndata(data2, data3);                                   \
    filterposx = (X >> FILTER_SHIFT);                                       \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    v11 = vis_fpadd16(u10, u11);                                            \
    u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);              \
    v12 = vis_fpadd16(u20, u21);                                            \
    data0 = dpSrc[0];                                                       \
    u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);              \
    X += dX;                                                                \
    data1 = dpSrc[1];                                                       \
    u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);              \
    v20 = vis_fpadd16(u00, u01);                                            \
    data2 = dpSrc[2];                                                       \
    u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);              \
    sum0 = vis_fpadd16(v00, v10);                                           \
    data3 = dpSrc[3];                                                       \
    row10 = vis_faligndata(data0, data1);                                   \
    u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);              \
    row11 = vis_faligndata(data1, data2);                                   \
    u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);              \
    row12 = vis_faligndata(data2, data3);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);              \
    v21 = vis_fpadd16(u10, u11);                                            \
    Y += dY;                                                                \
    xSrc = (X >> MLIB_SHIFT)-1;                                             \
    sum1 = vis_fpadd16(v01, v11);                                           \
    data0 = dpSrc[0];                                                       \
    u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);              \
    sum2 = vis_fpadd16(v02, v12);                                           \
    ySrc = (Y >> MLIB_SHIFT)-1;                                             \
    data1 = dpSrc[1];                                                       \
    v22 = vis_fpadd16(u20, u21);                                            \
    u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);              \
    data2 = dpSrc[2];                                                       \
    sum0 = vis_fpadd16(sum0, v20);                                          \
    u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);              \
    data3 = dpSrc[3];                                                       \
    v30 = vis_fpadd16(u00, u01);                                            \
    filterposy &= FILTER_MASK;                                              \
    row20 = vis_faligndata(data0, data1);                                   \
    sum1 = vis_fpadd16(sum1, v21);                                          \
    u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);              \
    row21 = vis_faligndata(data1, data2);                                   \
    row22 = vis_faligndata(data2, data3);                                   \
    sPtr += srcYStride;                                                     \
    filterposx &= FILTER_MASK;                                              \
    v31 = vis_fpadd16(u10, u11);                                            \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    sum2 = vis_fpadd16(sum2, v22);                                          \
    data1 = dpSrc[1];                                                       \
    v32 = vis_fpadd16(u20, u21);                                            \
    data2 = dpSrc[2];                                                       \
    sum0 = vis_fpadd16(sum0, v30);                                          \
    data3 = dpSrc[3];                                                       \
    row30 = vis_faligndata(data0, data1);                                   \
    v00 = vis_fmul8sux16(sum0, xFilter0);                                   \
    row31 = vis_faligndata(data1, data2);                                   \
    row32 = vis_faligndata(data2, data3);                                   \
    yPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4));  \
    sum1 = vis_fpadd16(sum1, v31);                                          \
    yFilter0 = yPtr[0];                                                     \
    sum2 = vis_fpadd16(sum2, v32);                                          \
    v01 = vis_fmul8ulx16(sum0, xFilter0);                                   \
    yFilter1 = yPtr[1];                                                     \
    v10 = vis_fmul8sux16(sum1, xFilter1);                                   \
    yFilter2 = yPtr[2];                                                     \
    v11 = vis_fmul8ulx16(sum1, xFilter1);                                   \
    d0 = vis_fpadd16(v00, v01);                                             \
    yFilter3 = yPtr[3];                                                     \
    xPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_3) + filterposx*3));  \
    v20 = vis_fmul8sux16(sum2, xFilter2);                                   \
    xFilter0 = xPtr[0];                                                     \
    v21 = vis_fmul8ulx16(sum2, xFilter2);                                   \
    d1 = vis_fpadd16(v10, v11);                                             \
    xFilter1 = xPtr[1];                                                     \
    d2 = vis_fpadd16(v20, v21);                                             \
    xFilter2 = xPtr[2];                                                     \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3);                           \
  } while (0)
888 
889 /***************************************************************/
/*
 * Final reduction of the pipelined 3-channel path: turns the three partial
 * words d0, d1, d2 left by BC_S16_3CH into the packed result f0.d.
 *
 * d0/d1 and d1/d2 are merged with vis_faligndata() after vis_alignaddr()
 * calls with addresses 6 and 2, the pieces are summed into d0, and d0 is
 * then expanded with vis_fmuld8sux16(f_x01000100, ...), packed with
 * vis_fpackfix_pair() and xor'ed with mask8000.  d1..d4 are clobbered.
 * The last vis_alignaddr() call issued here uses address 2, so code that
 * follows must re-establish its own alignment before relying on
 * vis_faligndata().
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement.
 */
#define FADD_3BC_S16()                                          \
  do {                                                          \
    vis_alignaddr((void*)6, 0);                                 \
    d3 = vis_faligndata(d0, d1);                                \
    vis_alignaddr((void*)2, 0);                                 \
    d4 = vis_faligndata(d1, d2);                                \
    d0 = vis_fpadd16(d0, d3);                                   \
    d2 = vis_fpadd16(d2, d4);                                   \
    d1 = vis_faligndata(d2, d2);                                \
    d0 = vis_fpadd16(d0, d1);                                   \
    d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));         \
    d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));         \
    f0.d = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000);       \
  } while (0)
902 
903 /***************************************************************/
mlib_ImageAffine_u16_3ch_bc(mlib_affine_param * param)904 mlib_status mlib_ImageAffine_u16_3ch_bc (mlib_affine_param *param)
905 {
906   DECLAREVAR_BC();
907   mlib_s32  filterposx, filterposy;
908   mlib_d64  data0, data1, data2, data3;
909   mlib_d64  sum0, sum1, sum2;
910   mlib_d64  row00, row10, row20, row30;
911   mlib_d64  row01, row11, row21, row31;
912   mlib_d64  row02, row12, row22, row32;
913   mlib_d64  xFilter0, xFilter1, xFilter2;
914   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
915   mlib_d64  v00, v01, v02, v10, v11, v12, v20, v21, v22, v30, v31, v32;
916   mlib_d64  u00, u01, u10, u11, u20, u21;
917   mlib_d64  d0, d1, d2, d3, d4;
918   mlib_d64 *yPtr, *xPtr;
919   mlib_d64 *dpSrc;
920   mlib_s32  cols, i;
921   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
922   mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
923   union {
924     mlib_s16 t[4];
925     mlib_d64 d;
926   } f0;
927   const mlib_s16 *mlib_filters_table_3;
928   const mlib_s16 *mlib_filters_table_4;
929 
930   if (filter == MLIB_BICUBIC) {
931     mlib_filters_table_3 = mlib_filters_s16_bc_3;
932     mlib_filters_table_4 = mlib_filters_s16_bc_4;
933   } else {
934     mlib_filters_table_3 = mlib_filters_s16_bc2_3;
935     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
936   }
937 
938   srcYStride >>= 1;
939 
940   for (j = yStart; j <= yFinish; j++) {
941 
942     vis_write_gsr(10 << 3);
943 
944     CLIP(3);
945 
946     cols = xRight - xLeft + 1;
947 
948     i = 0;
949 
950     if (i <= cols - 4) {
951 
952       NEXT_PIXEL_3BC_S16();
953       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
954 
955       NEXT_PIXEL_3BC_S16();
956 
957       BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
958       FADD_3BC_S16();
959 
960       BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
961 
962 #pragma pipeloop(0)
963       for (; i < cols-4; i++) {
964         STORE_BC_S16_3CH_1PIXEL();
965 
966         FADD_3BC_S16();
967         BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
968       }
969 
970       STORE_BC_S16_3CH_1PIXEL();
971 
972       FADD_3BC_S16();
973       STORE_BC_S16_3CH_1PIXEL();
974 
975       RESULT_3BC_S16_1PIXEL();
976       STORE_BC_S16_3CH_1PIXEL();
977 
978       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
979       RESULT_3BC_S16_1PIXEL();
980       STORE_BC_S16_3CH_1PIXEL();
981       i += 4;
982     }
983 
984     for (; i < cols; i++) {
985       NEXT_PIXEL_3BC_S16();
986       LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
987       RESULT_3BC_S16_1PIXEL();
988       STORE_BC_S16_3CH_1PIXEL();
989     }
990   }
991 
992   return MLIB_SUCCESS;
993 }
994 
995 /***************************************************************/
/*
 * Point sPtr at the first sample used for the current 4-channel pixel.
 *
 * The integer parts of the fixed-point coordinates X and Y (shifted right
 * by MLIB_SHIFT) are decremented by one and stored in xSrc and ySrc; sPtr
 * is then set to sample xSrc * 4 of source row lineAddr[ySrc].
 *
 * The sample offset is formed with a multiply instead of "xSrc << 2":
 * left-shifting a negative value is undefined in C, and xSrc is negative
 * whenever X >> MLIB_SHIFT is zero.  The body is wrapped in
 * do { } while (0) so the macro expands to a single statement.
 */
#define NEXT_PIXEL_4BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc * 4);             \
  } while (0)
1000 
1001 /***************************************************************/
/*
 * Non-pipelined load stage for one 4-channel pixel.
 *
 * Starting at sPtr, four consecutive source rows are read (sPtr is advanced
 * by srcYStride elements between rows, three times in total).  For every
 * row five 8-byte words are fetched from the address returned by
 * vis_alignaddr() and merged pairwise with vis_faligndata() into the four
 * words rowN0..rowN3 (N = 0..3).
 *
 * Both coefficient sets come from the single table mlib_filters_s16_4:
 *   - yFilter0..3 at byte offset filterposy * 4,
 *   - xFilter0..3 at byte offset filterposx * 4,
 * where filterposx/filterposy are (X or Y >> FILTER_SHIFT) & FILTER_MASK.
 * Finally X and Y are advanced by dX and dY.
 *
 * Wrapped in do { } while (0) so the macro is a single statement; the table
 * argument is parenthesized so that any pointer expression can be passed.
 */
#define LOAD_BC_S16_4CH_1PIXEL(mlib_filters_s16_4)                          \
  do {                                                                      \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    data4 = dpSrc[4];                                                       \
    row00 = vis_faligndata(data0, data1);                                   \
    row01 = vis_faligndata(data1, data2);                                   \
    row02 = vis_faligndata(data2, data3);                                   \
    row03 = vis_faligndata(data3, data4);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    data4 = dpSrc[4];                                                       \
    row10 = vis_faligndata(data0, data1);                                   \
    row11 = vis_faligndata(data1, data2);                                   \
    row12 = vis_faligndata(data2, data3);                                   \
    row13 = vis_faligndata(data3, data4);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    data4 = dpSrc[4];                                                       \
    row20 = vis_faligndata(data0, data1);                                   \
    row21 = vis_faligndata(data1, data2);                                   \
    row22 = vis_faligndata(data2, data3);                                   \
    row23 = vis_faligndata(data3, data4);                                   \
    sPtr += srcYStride;                                                     \
    dpSrc = vis_alignaddr(sPtr, 0);                                         \
    data0 = dpSrc[0];                                                       \
    data1 = dpSrc[1];                                                       \
    data2 = dpSrc[2];                                                       \
    data3 = dpSrc[3];                                                       \
    data4 = dpSrc[4];                                                       \
    row30 = vis_faligndata(data0, data1);                                   \
    row31 = vis_faligndata(data1, data2);                                   \
    row32 = vis_faligndata(data2, data3);                                   \
    row33 = vis_faligndata(data3, data4);                                   \
    filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                         \
    yPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4));  \
    yFilter0 = yPtr[0];                                                     \
    yFilter1 = yPtr[1];                                                     \
    yFilter2 = yPtr[2];                                                     \
    yFilter3 = yPtr[3];                                                     \
    filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                         \
    xPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_4) + filterposx*4));  \
    xFilter0 = xPtr[0];                                                     \
    xFilter1 = xPtr[1];                                                     \
    xFilter2 = xPtr[2];                                                     \
    xFilter3 = xPtr[3];                                                     \
    X += dX;                                                                \
    Y += dY;                                                                \
  } while (0)
1060 
1061 /***************************************************************/
1062 #define RESULT_4BC_S16_1PIXEL()                                 \
1063   u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);    \
1064   u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);    \
1065   u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);    \
1066   u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);    \
1067   v00 = vis_fpadd16(u00, u01);                                  \
1068   u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);    \
1069   v01 = vis_fpadd16(u10, u11);                                  \
1070   u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);    \
1071   u30 = vis_fmul8sux16(vis_fxor(row03, mask8000), yFilter0);    \
1072   u31 = vis_fmul8ulx16(vis_fxor(row03, mask8000), yFilter0);    \
1073   v02 = vis_fpadd16(u20, u21);                                  \
1074   u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);    \
1075   u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);    \
1076   v03 = vis_fpadd16(u30, u31);                                  \
1077   u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);    \
1078   u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);    \
1079   v10 = vis_fpadd16(u00, u01);                                  \
1080   u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);    \
1081   v11 = vis_fpadd16(u10, u11);                                  \
1082   u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);    \
1083   u30 = vis_fmul8sux16(vis_fxor(row13, mask8000), yFilter1);    \
1084   u31 = vis_fmul8ulx16(vis_fxor(row13, mask8000), yFilter1);    \
1085   u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);    \
1086   v12 = vis_fpadd16(u20, u21);                                  \
1087   u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);    \
1088   v13 = vis_fpadd16(u30, u31);                                  \
1089   u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);    \
1090   u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);    \
1091   v20 = vis_fpadd16(u00, u01);                                  \
1092   u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);    \
1093   sum0 = vis_fpadd16(v00, v10);                                 \
1094   u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);    \
1095   u30 = vis_fmul8sux16(vis_fxor(row23, mask8000), yFilter2);    \
1096   u31 = vis_fmul8ulx16(vis_fxor(row23, mask8000), yFilter2);    \
1097   u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);    \
1098   u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);    \
1099   v21 = vis_fpadd16(u10, u11);                                  \
1100   sum1 = vis_fpadd16(v01, v11);                                 \
1101   u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);    \
1102   sum2 = vis_fpadd16(v02, v12);                                 \
1103   sum3 = vis_fpadd16(v03, v13);                                 \
1104   v22 = vis_fpadd16(u20, u21);                                  \
1105   u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);    \
1106   sum0 = vis_fpadd16(sum0, v20);                                \
1107   u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);    \
1108   u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);    \
1109   v23 = vis_fpadd16(u30, u31);                                  \
1110   v30 = vis_fpadd16(u00, u01);                                  \
1111   sum1 = vis_fpadd16(sum1, v21);                                \
1112   u30 = vis_fmul8sux16(vis_fxor(row33, mask8000), yFilter3);    \
1113   u31 = vis_fmul8ulx16(vis_fxor(row33, mask8000), yFilter3);    \
1114   v31 = vis_fpadd16(u10, u11);                                  \
1115   sum2 = vis_fpadd16(sum2, v22);                                \
1116   sum3 = vis_fpadd16(sum3, v23);                                \
1117   v32 = vis_fpadd16(u20, u21);                                  \
1118   sum0 = vis_fpadd16(sum0, v30);                                \
1119   v33 = vis_fpadd16(u30, u31);                                  \
1120   v00 = vis_fmul8sux16(sum0, xFilter0);                         \
1121   sum1 = vis_fpadd16(sum1, v31);                                \
1122   sum2 = vis_fpadd16(sum2, v32);                                \
1123   v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
1124   v10 = vis_fmul8sux16(sum1, xFilter1);                         \
1125   sum3 = vis_fpadd16(sum3, v33);                                \
1126   v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
1127   d0 = vis_fpadd16(v00, v01);                                   \
1128   v20 = vis_fmul8sux16(sum2, xFilter2);                         \
1129   v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
1130   d1 = vis_fpadd16(v10, v11);                                   \
1131   v30 = vis_fmul8sux16(sum3, xFilter3);                         \
1132   v31 = vis_fmul8ulx16(sum3, xFilter3);                         \
1133   d2 = vis_fpadd16(v20, v21);                                   \
1134   d3 = vis_fpadd16(v30, v31);                                   \
1135   d0 = vis_fpadd16(d0, d1);                                     \
1136   d2 = vis_fpadd16(d2, d3);                                     \
1137   d0 = vis_fpadd16(d0, d2);                                     \
1138   d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1139   d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1140   res = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
1141 
1142 /***************************************************************/
/*
 * BC_S16_4CH(mlib_filters_s16_4)
 *
 * One software-pipelined step of the 4-channel bicubic kernel.  Two
 * independent streams of work are interleaved statement by statement, so
 * the order below is significant and must not be rearranged: every
 * register is overwritten only after its last use in the other stream.
 *
 * Stream 1 - arithmetic on the 4x4 neighbourhood already held in
 * row00..row33 (row<r><c>: source row r, column c):
 *   - each row<r><c> is XOR-ed with mask8000 and multiplied by yFilter<r>
 *     with the vis_fmul8sux16 / vis_fmul8ulx16 pair, the two halves being
 *     added with vis_fpadd16 (results v<r><c>);
 *   - the four products of column c are accumulated into sum<c>;
 *   - sum<c> is multiplied by xFilter<c> in the same way, leaving the
 *     four per-column products in d0..d3.
 *   The final d0+d1+d2+d3 reduction and packing is not done here; the
 *   caller follows up with FADD_4BC_S16().
 *
 * Stream 2 - set-up for the following pixels:
 *   - four source rows are read starting at sPtr (sPtr is advanced by
 *     srcYStride after each of the first three rows); for every row five
 *     doublewords are loaded from the address returned by
 *     vis_alignaddr(sPtr, 0) and combined pairwise with vis_faligndata
 *     into row<r>0..row<r>3;
 *   - filterposy / filterposx are derived from Y and X (shifted by
 *     FILTER_SHIFT, masked with FILTER_MASK) before Y and X are advanced,
 *     and are used as byte offsets (times 4) into the table given as the
 *     macro argument to reload yFilter0..3 and xFilter0..3;
 *   - X and Y are advanced by dX and dY, xSrc / ySrc are recomputed from
 *     the advanced values, and sPtr is repointed at
 *     lineAddr[ySrc] + 4 * xSrc.
 *
 * The macro expects all of the names it touches (row*, sum*, u*, v*, d*,
 * data0..4, xFilter*, yFilter*, xPtr, yPtr, dpSrc, sPtr, X, Y, dX, dY,
 * xSrc, ySrc, filterposx, filterposy, srcYStride, lineAddr, mask8000) to
 * be declared by the expanding function.  It calls vis_alignaddr(), so the
 * caller has to re-establish its own alignment offset afterwards if it
 * needs one.  The expansion has no trailing semicolon.
 *
 * The argument is parenthesised inside the expansion so that the
 * (mlib_u8 *) cast applies to the whole argument expression.
 */
#define BC_S16_4CH(mlib_filters_s16_4)                                  \
  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);            \
  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);            \
  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);            \
  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);            \
  v00 = vis_fpadd16(u00, u01);                                          \
  u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);            \
  v01 = vis_fpadd16(u10, u11);                                          \
  u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);            \
  u30 = vis_fmul8sux16(vis_fxor(row03, mask8000), yFilter0);            \
  u31 = vis_fmul8ulx16(vis_fxor(row03, mask8000), yFilter0);            \
  v02 = vis_fpadd16(u20, u21);                                          \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);            \
  u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);            \
  data0 = dpSrc[0];                                                     \
  filterposy = (Y >> FILTER_SHIFT);                                     \
  v03 = vis_fpadd16(u30, u31);                                          \
  data1 = dpSrc[1];                                                     \
  u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);            \
  data2 = dpSrc[2];                                                     \
  u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);            \
  v10 = vis_fpadd16(u00, u01);                                          \
  data3 = dpSrc[3];                                                     \
  u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);            \
  v11 = vis_fpadd16(u10, u11);                                          \
  data4 = dpSrc[4];                                                     \
  u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);            \
  row00 = vis_faligndata(data0, data1);                                 \
  u30 = vis_fmul8sux16(vis_fxor(row13, mask8000), yFilter1);            \
  row01 = vis_faligndata(data1, data2);                                 \
  u31 = vis_fmul8ulx16(vis_fxor(row13, mask8000), yFilter1);            \
  row02 = vis_faligndata(data2, data3);                                 \
  u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);            \
  row03 = vis_faligndata(data3, data4);                                 \
  filterposx = (X >> FILTER_SHIFT);                                     \
  sPtr += srcYStride;                                                   \
  v12 = vis_fpadd16(u20, u21);                                          \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);            \
  v13 = vis_fpadd16(u30, u31);                                          \
  data0 = dpSrc[0];                                                     \
  u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);            \
  X += dX;                                                              \
  data1 = dpSrc[1];                                                     \
  u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);            \
  v20 = vis_fpadd16(u00, u01);                                          \
  data2 = dpSrc[2];                                                     \
  u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);            \
  sum0 = vis_fpadd16(v00, v10);                                         \
  data3 = dpSrc[3];                                                     \
  u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);            \
  data4 = dpSrc[4];                                                     \
  row10 = vis_faligndata(data0, data1);                                 \
  u30 = vis_fmul8sux16(vis_fxor(row23, mask8000), yFilter2);            \
  row11 = vis_faligndata(data1, data2);                                 \
  u31 = vis_fmul8ulx16(vis_fxor(row23, mask8000), yFilter2);            \
  row12 = vis_faligndata(data2, data3);                                 \
  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);            \
  row13 = vis_faligndata(data3, data4);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);            \
  v21 = vis_fpadd16(u10, u11);                                          \
  Y += dY;                                                              \
  xSrc = (X >> MLIB_SHIFT)-1;                                           \
  sum1 = vis_fpadd16(v01, v11);                                         \
  data0 = dpSrc[0];                                                     \
  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);            \
  sum2 = vis_fpadd16(v02, v12);                                         \
  sum3 = vis_fpadd16(v03, v13);                                         \
  ySrc = (Y >> MLIB_SHIFT)-1;                                           \
  data1 = dpSrc[1];                                                     \
  v22 = vis_fpadd16(u20, u21);                                          \
  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);            \
  data2 = dpSrc[2];                                                     \
  sum0 = vis_fpadd16(sum0, v20);                                        \
  u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);            \
  data3 = dpSrc[3];                                                     \
  u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);            \
  v23 = vis_fpadd16(u30, u31);                                          \
  data4 = dpSrc[4];                                                     \
  v30 = vis_fpadd16(u00, u01);                                          \
  filterposy &= FILTER_MASK;                                            \
  row20 = vis_faligndata(data0, data1);                                 \
  sum1 = vis_fpadd16(sum1, v21);                                        \
  u30 = vis_fmul8sux16(vis_fxor(row33, mask8000), yFilter3);            \
  row21 = vis_faligndata(data1, data2);                                 \
  u31 = vis_fmul8ulx16(vis_fxor(row33, mask8000), yFilter3);            \
  row22 = vis_faligndata(data2, data3);                                 \
  row23 = vis_faligndata(data3, data4);                                 \
  sPtr += srcYStride;                                                   \
  filterposx &= FILTER_MASK;                                            \
  v31 = vis_fpadd16(u10, u11);                                          \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  data0 = dpSrc[0];                                                     \
  sum2 = vis_fpadd16(sum2, v22);                                        \
  sum3 = vis_fpadd16(sum3, v23);                                        \
  data1 = dpSrc[1];                                                     \
  v32 = vis_fpadd16(u20, u21);                                          \
  data2 = dpSrc[2];                                                     \
  sum0 = vis_fpadd16(sum0, v30);                                        \
  data3 = dpSrc[3];                                                     \
  v33 = vis_fpadd16(u30, u31);                                          \
  data4 = dpSrc[4];                                                     \
  row30 = vis_faligndata(data0, data1);                                 \
  v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
  row31 = vis_faligndata(data1, data2);                                 \
  row32 = vis_faligndata(data2, data3);                                 \
  row33 = vis_faligndata(data3, data4);                                 \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4)); \
  sum1 = vis_fpadd16(sum1, v31);                                        \
  yFilter0 = yPtr[0];                                                   \
  sum2 = vis_fpadd16(sum2, v32);                                        \
  v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
  yFilter1 = yPtr[1];                                                   \
  v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
  sum3 = vis_fpadd16(sum3, v33);                                        \
  yFilter2 = yPtr[2];                                                   \
  v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
  d0 = vis_fpadd16(v00, v01);                                           \
  yFilter3 = yPtr[3];                                                   \
  xPtr = ((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16_4) + filterposx*4)); \
  v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
  xFilter0 = xPtr[0];                                                   \
  v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
  d1 = vis_fpadd16(v10, v11);                                           \
  xFilter1 = xPtr[1];                                                   \
  v30 = vis_fmul8sux16(sum3, xFilter3);                                 \
  v31 = vis_fmul8ulx16(sum3, xFilter3);                                 \
  d2 = vis_fpadd16(v20, v21);                                           \
  xFilter2 = xPtr[2];                                                   \
  d3 = vis_fpadd16(v30, v31);                                           \
  xFilter3 = xPtr[3];                                                   \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1278 
1279 /***************************************************************/
/*
 * FADD_4BC_S16()
 *
 * Final reduction for one 4-channel pixel: adds the four per-column
 * products d0..d3 (as left by BC_S16_4CH) with vis_fpadd16, widens the
 * high and low halves of the sum through vis_fmuld8sux16 with the constant
 * f_x01000100, packs both halves back to 16-bit lanes with
 * vis_fpackfix_pair and XORs the packed value with mask8000, storing it in
 * res.
 *
 * NOTE(review): the packing step depends on the GSR scale factor that the
 * expanding function programs with vis_write_gsr(); the exact fixed-point
 * scaling should be confirmed against the VIS instruction reference.
 *
 * d0, d2 and d3 are clobbered.  Requires d0..d3, res, f_x01000100 and
 * mask8000 in the expanding scope.  The expansion has no trailing
 * semicolon.
 */
#define FADD_4BC_S16()                                          \
  d0 = vis_fpadd16(d0, d1);                                     \
  d2 = vis_fpadd16(d2, d3);                                     \
  d0 = vis_fpadd16(d0, d2);                                     \
  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
  res = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
1287 
1288 /***************************************************************/
mlib_ImageAffine_u16_4ch_bc(mlib_affine_param * param)1289 mlib_status mlib_ImageAffine_u16_4ch_bc (mlib_affine_param *param)
1290 {
1291   DECLAREVAR_BC();
1292   DTYPE  *dstLineEnd;
1293   mlib_s32  filterposx, filterposy;
1294   mlib_d64  data0, data1, data2, data3, data4;
1295   mlib_d64  sum0, sum1, sum2, sum3;
1296   mlib_d64  row00, row10, row20, row30;
1297   mlib_d64  row01, row11, row21, row31;
1298   mlib_d64  row02, row12, row22, row32;
1299   mlib_d64  row03, row13, row23, row33;
1300   mlib_d64  xFilter0, xFilter1, xFilter2, xFilter3;
1301   mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
1302   mlib_d64  v00, v01, v02, v03, v10, v11, v12, v13;
1303   mlib_d64  v20, v21, v22, v23, v30, v31, v32, v33;
1304   mlib_d64  u00, u01, u10, u11, u20, u21, u30, u31;
1305   mlib_d64  d0, d1, d2, d3;
1306   mlib_d64 *yPtr, *xPtr;
1307   mlib_d64 *dp, *dpSrc;
1308   mlib_s32  cols, i, mask, gsrd;
1309   mlib_d64  res;
1310   mlib_f32  f_x01000100 = vis_to_float(0x01000100);
1311   mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
1312   const mlib_s16 *mlib_filters_table_4;
1313 
1314   if (filter == MLIB_BICUBIC) {
1315     mlib_filters_table_4 = mlib_filters_s16_bc_4;
1316   } else {
1317     mlib_filters_table_4 = mlib_filters_s16_bc2_4;
1318   }
1319 
1320   srcYStride >>= 1;
1321 
1322   for (j = yStart; j <= yFinish; j++) {
1323 
1324     vis_write_gsr(10 << 3);
1325 
1326     CLIP(4);
1327     dstLineEnd  = (DTYPE*)dstData + 4 * xRight;
1328 
1329     cols = xRight - xLeft + 1;
1330     dp = vis_alignaddr(dstPixelPtr, 0);
1331     dstLineEnd += 3;
1332     mask = vis_edge16(dstPixelPtr, dstLineEnd);
1333     gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1334 
1335     i = 0;
1336 
1337     if (i <= cols - 4) {
1338 
1339       NEXT_PIXEL_4BC_S16();
1340       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1341 
1342       NEXT_PIXEL_4BC_S16();
1343 
1344       BC_S16_4CH(mlib_filters_table_4);
1345       FADD_4BC_S16();
1346 
1347       BC_S16_4CH(mlib_filters_table_4);
1348 
1349 #pragma pipeloop(0)
1350       for (; i < cols-4; i++) {
1351         vis_alignaddr((void *)gsrd, 0);
1352         res = vis_faligndata(res, res);
1353 
1354         vis_pst_16(res, dp++, mask);
1355         vis_pst_16(res, dp, ~mask);
1356 
1357         FADD_4BC_S16();
1358         BC_S16_4CH(mlib_filters_table_4);
1359       }
1360 
1361       vis_alignaddr((void *)gsrd, 0);
1362       res = vis_faligndata(res, res);
1363       vis_pst_16(res, dp++, mask);
1364       vis_pst_16(res, dp, ~mask);
1365 
1366       FADD_4BC_S16();
1367       vis_alignaddr((void *)gsrd, 0);
1368       res = vis_faligndata(res, res);
1369       vis_pst_16(res, dp++, mask);
1370       vis_pst_16(res, dp, ~mask);
1371 
1372       RESULT_4BC_S16_1PIXEL();
1373       vis_alignaddr((void *)gsrd, 0);
1374       res = vis_faligndata(res, res);
1375       vis_pst_16(res, dp++, mask);
1376       vis_pst_16(res, dp, ~mask);
1377 
1378       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1379       RESULT_4BC_S16_1PIXEL();
1380       vis_alignaddr((void *)gsrd, 0);
1381       res = vis_faligndata(res, res);
1382       vis_pst_16(res, dp++, mask);
1383       vis_pst_16(res, dp, ~mask);
1384       i += 4;
1385     }
1386 
1387 #pragma pipeloop(0)
1388     for (; i < cols; i++) {
1389       NEXT_PIXEL_4BC_S16();
1390       LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1391       RESULT_4BC_S16_1PIXEL();
1392       vis_alignaddr((void *)gsrd, 0);
1393       res = vis_faligndata(res, res);
1394       vis_pst_16(res, dp++, mask);
1395       vis_pst_16(res, dp, ~mask);
1396     }
1397   }
1398 
1399   return MLIB_SUCCESS;
1400 }
1401 
1402 /***************************************************************/
1403