1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "remap.hpp"
41
42 namespace CAROTENE_NS {
43
44 #ifdef CAROTENE_NEON
45
46 namespace internal {
47
remapNearestNeighborReplicate(const Size2D size,const u8 * srcBase,const s32 * map,u8 * dstBase,ptrdiff_t dstStride)48 void remapNearestNeighborReplicate(const Size2D size,
49 const u8 * srcBase,
50 const s32 * map,
51 u8 * dstBase, ptrdiff_t dstStride)
52 {
53 for (size_t y = 0; y < size.height; ++y)
54 {
55 const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
56 u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
57
58 for (size_t x = 0; x < size.width; ++x)
59 {
60 dst_row[x] = srcBase[map_row[x]];
61 }
62 }
63 }
64
remapNearestNeighborConst(const Size2D size,const u8 * srcBase,const s32 * map,u8 * dstBase,ptrdiff_t dstStride,u8 borderValue)65 void remapNearestNeighborConst(const Size2D size,
66 const u8 * srcBase,
67 const s32 * map,
68 u8 * dstBase, ptrdiff_t dstStride,
69 u8 borderValue)
70 {
71 for (size_t y = 0; y < size.height; ++y)
72 {
73 const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
74 u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
75
76 for (size_t x = 0; x < size.width; ++x)
77 {
78 s32 src_idx = map_row[x];
79 dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
80 }
81 }
82 }
83
// Bilinear remap with replicated borders over precomputed tables.
//
// Per destination pixel the caller supplies:
//   map:    4 absolute source offsets [tl, tr, bl, br], already clamped
//           inside the image (replicate border), stride 4 per pixel;
//   coeffs: 2 fractional weights (x fraction, y fraction), stride 2.
// The vector path handles 8 pixels per iteration; the remaining tail
// (note the `x + 8 < width`, so at least the last pixel) runs scalar.
void remapLinearReplicate(const Size2D size,
                          const u8 * srcBase,
                          const s32 * map,
                          const f32 * coeffs,
                          u8 * dstBase, ptrdiff_t dstStride)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        for ( ; x + 8 < size.width; x += 8)
        {
            // Lane-by-lane gather of the top-left neighbor for 8 pixels
            // (map entries 0, 4, 8, ... = slot 0 of each 4-entry group).
            int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);

            // Top-right neighbors (slot 1 of each group).
            int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);

            // Bottom-left neighbors (slot 2 of each group).
            int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);

            // Bottom-right neighbors (slot 3 of each group).
            int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);

            // first part
            // Interpolate the low 4 pixels in f32:
            //   row0 = src00 + (src01 - src00) * fx
            //   row1 = src10 + (src11 - src10) * fx
            //   dst  = row0 + (row1 - row0) * fy
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            // Truncating f32->u32 conversion matches the scalar tail's floorf
            // (interpolated values are non-negative here).
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            // Same interpolation for the high 4 pixels.
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            // Narrow the 8 u16 results to u8 and write them out.
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        // Scalar tail: identical bilinear formula, one pixel at a time.
        for ( ; x < size.width; ++x)
        {
            s32 src00_index = map_row[(x << 2)];
            s32 src10_index = map_row[(x << 2) + 2];
            f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
                            srcBase[src00_index];
            f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
                            srcBase[src10_index];
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}
180
// Bilinear remap with a constant border over precomputed tables.
//
// Identical interpolation to remapLinearReplicate, but each of the 4
// per-pixel map entries may be negative, which marks a neighbor outside
// the source image: that neighbor's sample is replaced by `borderValue`
// before interpolation.
void remapLinearConst(const Size2D size,
                      const u8 * srcBase,
                      const s32 * map,
                      const f32 * coeffs,
                      u8 * dstBase, ptrdiff_t dstStride,
                      u8 borderValue)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        for ( ; x + 8 < size.width; x += 8)
        {
            // Top-left neighbors for 8 pixels, substituting the border
            // value wherever the precomputed offset is negative.
            int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);

            // Top-right neighbors.
            int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);

            // Bottom-left neighbors.
            int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);

            // Bottom-right neighbors.
            int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);

            // first part
            // Bilinear blend of the low 4 pixels:
            //   row0 = src00 + (src01 - src00) * fx
            //   row1 = src10 + (src11 - src10) * fx
            //   dst  = row0 + (row1 - row0) * fy
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            // Same blend for the high 4 pixels.
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            // Narrow the 8 u16 results to u8 and write them out.
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        // Scalar tail: identical neighbor gather + bilinear formula.
        for ( ; x < size.width; ++x)
        {
            s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
            s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
            s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
            s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;

            f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
            f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}
279
280 } // namespace internal
281
282 #endif // CAROTENE_NEON
283
isRemapNearestNeighborSupported(const Size2D & ssize)284 bool isRemapNearestNeighborSupported(const Size2D &ssize)
285 {
286 #if SIZE_MAX > UINT32_MAX
287 return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
288 // is performed with u32
289 isSupportedConfiguration();
290 #else
291 (void)ssize;
292 return isSupportedConfiguration();
293 #endif
294 }
295
isRemapLinearSupported(const Size2D & ssize)296 bool isRemapLinearSupported(const Size2D &ssize)
297 {
298 #if SIZE_MAX > UINT32_MAX
299 return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
300 // is performed with u32
301 isSupportedConfiguration();
302 #else
303 (void)ssize;
304 return isSupportedConfiguration();
305 #endif
306 }
307
// Nearest-neighbor remap driver.
//
// tableBase holds interleaved (x, y) f32 source coordinates for each
// destination pixel. The image is processed in BLOCK_SIZE x BLOCK_SIZE
// tiles: for each tile a table of absolute s32 source offsets is built
// (clamped for BORDER_MODE_REPLICATE, or -1 for out-of-image samples
// with BORDER_MODE_CONSTANT), then the matching gather kernel fills the
// destination tile. Other border modes are silently ignored.
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                          const u8 * srcBase, ptrdiff_t srcStride,
                          const f32 * tableBase, ptrdiff_t tableStride,
                          u8 * dstBase, ptrdiff_t dstStride,
                          BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile offset scratch, over-allocated so it can be 16-byte aligned.
    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
    s32 * map = alignPtr(_map, 16);

    // Broadcast clamp limits (width-1, height-1) and the row stride for
    // vectorized index computation: index = y * srcStride + x.
    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride);
    int32x2_t v_step2 = vdup_n_s32(srcStride);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);
        int32x2_t v_zero2 = vdup_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    // 8-, 4- and 2-wide vector passes: deinterleave (x, y)
                    // coordinates, truncate to s32, clamp into the image,
                    // and fold into a single absolute offset.
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
                        v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
                        v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
                        int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
                        int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    // Scalar tail. NOTE(review): this uses floorf while the
                    // vector paths use truncating vcvt; the two differ for
                    // negative coordinates before clamping — looks like
                    // negative inputs are clamped to 0 either way, so the
                    // final offset matches. Confirm for exotic tables.
                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
                        s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
                        map_row[x] = src_y * srcStride + src_x;
                    }
                }

                // make remap
                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                              getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        // Out-of-range samples are flagged with offset -1 so the gather
        // kernel can substitute borderValue.
        int32x4_t v_m1_4 = vdupq_n_s32(-1);
        int32x2_t v_m1_2 = vdup_n_s32(-1);
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        float32x2_t v_zero2 = vdup_n_f32(0.0f);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    // Vector passes: build an in-bounds mask
                    // (coord >= 0 && coord <= limit) and select between the
                    // computed offset and -1.
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
                        v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
                        v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                           vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
                        int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
                        uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
                                                     vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
                        int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    // Scalar tail: same in-bounds test, offset or -1.
                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
                        s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                    }
                }

                // make remap
                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                          getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }

#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
489
// Bilinear remap driver.
//
// tableBase holds interleaved (x, y) f32 source coordinates for each
// destination pixel. The image is processed in BLOCK_SIZE x BLOCK_SIZE
// tiles: for each tile, 4 neighbor offsets and 2 fractional weights are
// precomputed per pixel, then the matching interpolation kernel
// (replicate or constant border) produces the destination tile. Other
// border modes are silently ignored.
void remapLinear(const Size2D &ssize, const Size2D &dsize,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 const f32 * tableBase, ptrdiff_t tableStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile scratch: 4 offsets and 2 coefficients per pixel,
    // over-allocated for 16-byte alignment.
    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];

    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        // vcvt truncates toward zero; for negative
                        // coordinates that is one above floor().
                        int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
                        // A negative fraction means truncation overshot
                        // floor(): add 1 to the fraction and subtract 1
                        // from the integer part to recover floor semantics.
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);

                        // Clamp both neighbor coordinates into the image
                        // (replicate border).
                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));

                        // Absolute offsets in [tl, tr, bl, br] slot order.
                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    // Scalar tail: floorf directly, then clamp neighbors.
                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[x << 1] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        // Out-of-image neighbors are flagged with offset -1 so the
        // interpolation kernel substitutes borderValue per neighbor.
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);

                        // Same truncation-to-floor correction as the
                        // replicate path: negative fraction => fraction+1,
                        // integer part-1.
                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);

                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        // Per-coordinate in-bounds masks (float >= 0 check
                        // avoids the truncated-negative ambiguity; integer
                        // <= limit check bounds the high side).
                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));

                        // Combine masks per neighbor; -1 marks border.
                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    // Scalar tail: same 4-neighbor offsets with explicit
                    // bounds tests, -1 for out-of-image neighbors.
                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1)] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
693
694 } // namespace CAROTENE_NS
695