/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "remap.hpp"

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace internal {

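// Gather kernel for nearest-neighbour remap with BORDER_MODE_REPLICATE:
// map stores one precomputed s32 offset into srcBase per destination pixel
// (src_y * srcStride + src_x), and every offset is expected to be in bounds.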
void remapNearestNeighborReplicate(const Size2D size,
                                   const u8 * srcBase,
                                   const s32 * map,
                                   u8 * dstBase, ptrdiff_t dstStride)
{
    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        for (size_t x = 0; x < size.width; ++x)
        {
            dst_row[x] = srcBase[map_row[x]];
        }
    }
}

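// Gather kernel for nearest-neighbour remap with BORDER_MODE_CONSTANT:
// a negative offset marks an out-of-range source coordinate, for which
// borderValue is written instead.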
void remapNearestNeighborConst(const Size2D size,
                               const u8 * srcBase,
                               const s32 * map,
                               u8 * dstBase, ptrdiff_t dstStride,
                               u8 borderValue)
{
    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        for (size_t x = 0; x < size.width; ++x)
        {
            s32 src_idx = map_row[x];
            dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
        }
    }
}

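// Bilinear interpolation kernel for BORDER_MODE_REPLICATE.  For every
// destination pixel, map holds four s32 offsets into srcBase (top-left,
// top-right, bottom-left, bottom-right neighbours) and coeffs holds the two
// fractional weights (wx, wy).  The result is the usual bilinear blend:
// interpolate each neighbour row along x, then blend the two rows along y.
// The vector loop handles 8 destination pixels per iteration; the scalar
// loop finishes the row.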
void remapLinearReplicate(const Size2D size,
                          const u8 * srcBase,
                          const s32 * map,
                          const f32 * coeffs,
                          u8 * dstBase, ptrdiff_t dstStride)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        for ( ; x + 8 < size.width; x += 8)
        {
            int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);

            int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);

            int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);

            int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);

            // first part
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        for ( ; x < size.width; ++x)
        {
            s32 src00_index = map_row[(x << 2)];
            s32 src10_index = map_row[(x << 2) + 2];
            f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
                             srcBase[src00_index];
            f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
                             srcBase[src10_index];
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}

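// Bilinear interpolation kernel for BORDER_MODE_CONSTANT: same blend as
// above, but any neighbour whose offset is negative (outside the source
// image) contributes borderValue instead of a source pixel.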
void remapLinearConst(const Size2D size,
                      const u8 * srcBase,
                      const s32 * map,
                      const f32 * coeffs,
                      u8 * dstBase, ptrdiff_t dstStride,
                      u8 borderValue)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        for ( ; x + 8 < size.width; x += 8)
        {
            int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) +  4] >= 0 ? srcBase[map_row[(x << 2) +  4]] : borderValue, v_src00, 1);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) +  8] >= 0 ? srcBase[map_row[(x << 2) +  8]] : borderValue, v_src00, 2);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);

            int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) +  5] >= 0 ? srcBase[map_row[(x << 2) +  5]] : borderValue, v_src01, 1);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) +  9] >= 0 ? srcBase[map_row[(x << 2) +  9]] : borderValue, v_src01, 2);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);

            int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) +  6] >= 0 ? srcBase[map_row[(x << 2) +  6]] : borderValue, v_src10, 1);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);

            int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) +  7] >= 0 ? srcBase[map_row[(x << 2) +  7]] : borderValue, v_src11, 1);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);

            // first part
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        for ( ; x < size.width; ++x)
        {
            s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
            s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
            s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
            s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;

            f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
            f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}

} // namespace internal

#endif // CAROTENE_NEON

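// Capability checks for the public entry points: besides the generic
// isSupportedConfiguration() test, 64-bit builds reject source images whose
// width or height does not fit the 32-bit index arithmetic used above.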
bool isRemapNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
                                                                       // is performed with u32
           isSupportedConfiguration();
#else
    (void)ssize;
    return isSupportedConfiguration();
#endif
}

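// Same size restriction for the bilinear path.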
bool isRemapLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
                                                                       // is performed with u32
           isSupportedConfiguration();
#else
    (void)ssize;
    return isSupportedConfiguration();
#endif
}

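// Nearest-neighbour remap: tableBase holds an interleaved (x, y) f32 source
// coordinate for every destination pixel.  The destination is processed in
// BLOCK_SIZE x BLOCK_SIZE tiles; each tile's coordinates are converted into
// per-pixel s32 offsets into srcBase, and the matching block kernel above
// performs the gather.
//
// Usage sketch (illustrative only; buffer allocation and the names below are
// the caller's, not part of this file):
//
//     Size2D ssize(srcWidth, srcHeight), dsize(dstWidth, dstHeight);
//     if (isRemapNearestNeighborSupported(ssize))
//         remapNearestNeighbor(ssize, dsize,
//                              src, srcStride,       // u8 source image
//                              table, tableStride,   // f32 (x, y) pair per dst pixel
//                              dst, dstStride,
//                              BORDER_MODE_REPLICATE, 0);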
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                          const u8 * srcBase, ptrdiff_t srcStride,
                          const f32 * tableBase, ptrdiff_t tableStride,
                          u8 * dstBase, ptrdiff_t dstStride,
                          BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
    s32 * map = alignPtr(_map, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride);
    int32x2_t v_step2 = vdup_n_s32(srcStride);

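    // Replicate border: source coordinates are clamped to
    // [0, width - 1] x [0, height - 1], so every map entry is a valid offset.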
    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);
        int32x2_t v_zero2 = vdup_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
                        v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
                        v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
                        int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
                        int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
                        s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
                        map_row[x] = src_y * srcStride + src_x;
                    }
                }

                // make remap
                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                              getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
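    // Constant border: coordinates that fall outside the source image are
    // flagged with -1, and remapNearestNeighborConst writes borderValue there.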
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        int32x4_t v_m1_4 = vdupq_n_s32(-1);
        int32x2_t v_m1_2 = vdup_n_s32(-1);
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        float32x2_t v_zero2 = vdup_n_f32(0.0f);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
                        v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
                        v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                           vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
                        int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
                        uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
                                                     vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
                        int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
                        s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                    }
                }

                // make remap
                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                          getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }

#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}

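// Bilinear remap: same tiling scheme as remapNearestNeighbor, but each table
// entry (x, y) is split into an integer base coordinate and a fractional
// part.  The four neighbour offsets go into map, the two fractional weights
// into coeffs, and the block kernels above do the interpolation.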
void remapLinear(const Size2D &ssize, const Size2D &dsize,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 const f32 * tableBase, ptrdiff_t tableStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];

    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

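    // Replicate border.  vcvtq_s32_f32 truncates toward zero, so negative
    // coordinates yield a negative fractional part; the mask/bsl fix-up below
    // moves the integer part down by one and the weight up by one, matching
    // the floorf() behaviour of the scalar tail.  The four neighbour
    // coordinates are then clamped into the source image.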
    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);

                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[x << 1] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
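    // Constant border: neighbours that fall outside the source image get
    // offset -1, which remapLinearConst replaces with borderValue.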
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);

                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));

                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1)] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}

} // namespace CAROTENE_NS