1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 namespace CAROTENE_NS {
43
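// Maximum number of elements accumulated in f32 vector lanes before the partial sums are flushed into the f64 result; must be a multiple of 4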
45 #define NORM32F_BLOCK_SIZE 2048
46
normInf(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)47 s32 normInf(const Size2D &_size,
48 const u8 * srcBase, ptrdiff_t srcStride)
49 {
50 internal::assertSupportedConfiguration();
51 #ifdef CAROTENE_NEON
52 Size2D size(_size);
53 if (srcStride == (ptrdiff_t)(size.width))
54 {
55 size.width *= size.height;
56 size.height = 1;
57 }
58 s32 result = 0;
59 for(size_t k = 0; k < size.height; ++k)
60 {
61 const u8* src = internal::getRowPtr( srcBase, srcStride, k);
62 size_t i = 0;
63 if (size.width >= 16)
64 {
65 uint8x16_t s = vld1q_u8(src);
66 for (i = 16; i <= size.width - 16; i += 16)
67 {
68 internal::prefetch(src + i);
69 uint8x16_t s1 = vld1q_u8(src + i);
70 s = vmaxq_u8(s1, s);
71 }
72 u8 s2[8];
73 uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s));
74 vst1_u8(s2, s3);
75 for (u32 j = 0; j < 8; j++)
76 result = std::max((s32)(s2[j]), result);
77 }
78 for ( ; i < size.width; i++)
79 result = std::max((s32)(src[i]), result);
80 }
81 return result;
82 #else
83 (void)_size;
84 (void)srcBase;
85 (void)srcStride;
86
87 return 0;
88 #endif
89 }
90
normInf(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)91 s32 normInf(const Size2D &_size,
92 const s8 * srcBase, ptrdiff_t srcStride)
93 {
94 internal::assertSupportedConfiguration();
95 #ifdef CAROTENE_NEON
96 Size2D size(_size);
97 if (srcStride == (ptrdiff_t)(size.width))
98 {
99 size.width *= size.height;
100 size.height = 1;
101 }
102 s32 result = 0;
103 for(size_t k = 0; k < size.height; ++k)
104 {
105 const s8* src = internal::getRowPtr( srcBase, srcStride, k);
106 size_t i = 0;
107 if (size.width >= 16)
108 {
109 uint8x16_t s = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src)));
110 for (i = 16; i <= size.width - 16; i += 16)
111 {
112 internal::prefetch(src + i);
113 uint8x16_t s1 = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src + i)));
114 s = vmaxq_u8(s1, s);
115 }
116 u8 s2[8];
117 uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s));
118 vst1_u8(s2, s3);
119 for (u32 j = 0; j < 8; j++)
120 result = std::max((s32)(s2[j]), result);
121 }
122 for ( ; i < size.width; i++)
123 result = std::max((s32)(std::abs(src[i])), result);
124 }
125 return result;
126 #else
127 (void)_size;
128 (void)srcBase;
129 (void)srcStride;
130
131 return 0;
132 #endif
133 }
134
normInf(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)135 s32 normInf(const Size2D &_size,
136 const u16 * srcBase, ptrdiff_t srcStride)
137 {
138 internal::assertSupportedConfiguration();
139 #ifdef CAROTENE_NEON
140 Size2D size(_size);
141 if (srcStride == (ptrdiff_t)(size.width))
142 {
143 size.width *= size.height;
144 size.height = 1;
145 }
146 s32 result = 0;
147 for(size_t k = 0; k < size.height; ++k)
148 {
149 const u16* src = internal::getRowPtr( srcBase, srcStride, k);
150 size_t i = 0;
151 if (size.width >= 8)
152 {
153 uint16x8_t s = vld1q_u16(src);
154 for (i = 8; i <= size.width - 8; i += 8)
155 {
156 internal::prefetch(src + i);
157 uint16x8_t s1 = vld1q_u16(src + i);
158 s = vmaxq_u16(s1, s);
159 }
160 u16 s2[4];
161 uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s));
162 vst1_u16(s2, s3);
163 for (u32 j = 0; j < 4; j++)
164 result = std::max((s32)(s2[j]), result);
165 }
166 for ( ; i < size.width; i++)
167 result = std::max((s32)(src[i]), result);
168 }
169 return result;
170 #else
171 (void)_size;
172 (void)srcBase;
173 (void)srcStride;
174
175 return 0;
176 #endif
177 }
178
normInf(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)179 s32 normInf(const Size2D &_size,
180 const s16 * srcBase, ptrdiff_t srcStride)
181 {
182 internal::assertSupportedConfiguration();
183 #ifdef CAROTENE_NEON
184 Size2D size(_size);
185 if (srcStride == (ptrdiff_t)(size.width))
186 {
187 size.width *= size.height;
188 size.height = 1;
189 }
190 s32 result = 0;
191 for(size_t k = 0; k < size.height; ++k)
192 {
193 const s16* src = internal::getRowPtr( srcBase, srcStride, k);
194 size_t i = 0;
195 if (size.width >= 8)
196 {
197 uint16x8_t s = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src)));
198 for (i = 8; i <= size.width - 8; i += 8)
199 {
200 internal::prefetch(src + i);
201 uint16x8_t s1 = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src + i)));
202 s = vmaxq_u16(s1, s);
203 }
204 u16 s2[4];
205 uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s));
206 vst1_u16(s2, s3);
207 for (u32 j = 0; j < 4; j++)
208 result = std::max((s32)(s2[j]), result);
209 }
210 for ( ; i < size.width; i++)
211 result = std::max(std::abs((s32)(src[i])), result);
212 }
213 return result;
214 #else
215 (void)_size;
216 (void)srcBase;
217 (void)srcStride;
218
219 return 0;
220 #endif
221 }
222
/*
 * Infinity norm of an s32 2D array: the largest absolute element value.
 *
 * The absolute value saturates: an element equal to the minimum s32 value
 * contributes 0x7fFFffFF, because its true magnitude does not fit in the
 * s32 return type.
 *
 *   _size     - width and height, both counted in elements
 *   srcBase   - pointer to the first row
 *   srcStride - row stride, forwarded to internal::getRowPtr
 *
 * Returns 0 when CAROTENE_NEON is not defined.
 */
s32 normInf(const Size2D &_size,
            const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the data is walked as a single row.
    // NOTE(review): the stride is compared against an element count here;
    // confirm it is expressed in the same unit for 4-byte elements.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;
        if (size.width >= 4)
        {
            // Saturating abs keeps every lane non-negative, so a signed
            // maximum is valid. A non-saturating abs would leave the minimum
            // s32 value negative and the lane would be lost in the final
            // reduction against result.
            int32x4_t s = vqabsq_s32(vld1q_s32(src));
            for (i = 4; i <= size.width - 4; i += 4)
            {
                internal::prefetch(src + i);
                int32x4_t s1 = vqabsq_s32(vld1q_s32(src + i));
                s = vmaxq_s32(s1, s);
            }
            // Fold 4 lanes into 2, then finish the reduction in scalar code.
            s32 s2[2];
            int32x2_t s3 = vmax_s32(vget_low_s32(s), vget_high_s32(s));
            vst1_s32(s2, s3);
            for (u32 j = 0; j < 2; j++)
                result = std::max(s2[j], result);
        }
        for ( ; i < size.width; i++)
        {
            // std::abs() of the minimum s32 value is undefined; saturate by
            // hand so the tail agrees with the vector path.
            const s32 v = src[i];
            const s32 a = (v == -0x7fFFffFF - 1) ? 0x7fFFffFF : std::abs(v);
            result = std::max(a, result);
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
266
/*
 * Infinity norm of an f32 2D array: the largest absolute element value.
 *
 *   _size     - width and height, both counted in elements
 *   srcBase   - pointer to the first row
 *   srcStride - row stride, forwarded to internal::getRowPtr
 *
 * Returns 0 when CAROTENE_NEON is not defined.
 */
f32 normInf(const Size2D &_size,
            const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#else
    Size2D dims(_size);
    // When the stride equals the width the data is walked as a single row.
    // NOTE(review): the stride is compared against an element count here;
    // confirm it is expressed in the same unit for 4-byte elements.
    if (srcStride == static_cast<ptrdiff_t>(dims.width))
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

    f32 maxAbs = 0;
    for (size_t row = 0; row < dims.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (dims.width >= 4)
        {
            // Per-lane running maximum of |value| over 4-element chunks.
            float32x4_t vmax = vabsq_f32(vld1q_f32(line));
            x = 4;
            while (x <= dims.width - 4)
            {
                internal::prefetch(line + x);
                const float32x4_t vabs = vabsq_f32(vld1q_f32(line + x));
                vmax = vmaxq_f32(vabs, vmax);
                x += 4;
            }

            // Fold 4 lanes into 2, then finish the reduction in scalar code.
            f32 lanes[2];
            vst1_f32(lanes, vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)));
            for (u32 lane = 0; lane < 2; ++lane)
                maxAbs = std::max(lanes[lane], maxAbs);
        }

        // Elements that did not fill a whole vector.
        for (; x < dims.width; ++x)
            maxAbs = std::max(std::abs(line[x]), maxAbs);
    }
    return maxAbs;
#endif
}
311
normL1(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)312 s32 normL1(const Size2D &_size,
313 const u8 * srcBase, ptrdiff_t srcStride)
314 {
315 internal::assertSupportedConfiguration();
316 #ifdef CAROTENE_NEON
317 Size2D size(_size);
318 if (srcStride == (ptrdiff_t)(size.width))
319 {
320 size.width *= size.height;
321 size.height = 1;
322 }
323 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324 s32 result = 0;
325 for(size_t k = 0; k < size.height; ++k)
326 {
327 const u8* src = internal::getRowPtr( srcBase, srcStride, k);
328 size_t i = 0;
329 uint32x4_t vs = vmovq_n_u32(0);
330 for (; i < roiw8;)
331 {
332 size_t limit = std::min(size.width, i + 256) - 8;
333 uint8x8_t s0 = vld1_u8(src + i);
334 uint16x8_t s = vmovl_u8(s0);
335
336 for (i += 8; i <= limit; i += 8)
337 {
338 internal::prefetch(src + i);
339 uint8x8_t s1 = vld1_u8(src + i);
340 s = vaddw_u8(s, s1);
341 }
342
343 uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s));
344 vs = vaddw_u16(vs, s4);
345 }
346
347 u32 s2[2];
348 uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs));
349 vst1_u32(s2, vs2);
350
351 result += (s32)(s2[0] + s2[1]);
352
353 for ( ; i < size.width; i++)
354 result += (s32)(src[i]);
355 }
356 return result;
357 #else
358 (void)_size;
359 (void)srcBase;
360 (void)srcStride;
361
362 return 0;
363 #endif
364 }
365
normL1(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)366 s32 normL1(const Size2D &_size,
367 const s8 * srcBase, ptrdiff_t srcStride)
368 {
369 internal::assertSupportedConfiguration();
370 #ifdef CAROTENE_NEON
371 Size2D size(_size);
372 if (srcStride == (ptrdiff_t)(size.width))
373 {
374 size.width *= size.height;
375 size.height = 1;
376 }
377 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
378 s32 result = 0;
379 for(size_t k = 0; k < size.height; ++k)
380 {
381 const s8* src = internal::getRowPtr( srcBase, srcStride, k);
382 size_t i = 0;
383 uint32x4_t vs = vmovq_n_u32(0);
384
385 for (; i < roiw8;)
386 {
387 size_t limit = std::min(size.width, i + 256) - 8;
388 uint8x8_t s0 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i)));
389 uint16x8_t s = vmovl_u8(s0);
390
391 for (i += 8; i <= limit; i += 8)
392 {
393 internal::prefetch(src + i);
394 uint8x8_t s1 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i)));
395 s = vaddw_u8(s, s1);
396 }
397
398 uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s));
399 vs = vaddw_u16(vs, s4);
400 }
401
402 u32 s2[2];
403 uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs));
404 vst1_u32(s2, vs2);
405
406 result += (s32)(s2[0] + s2[1]);
407
408 for ( ; i < size.width; i++)
409 result += (s32)(std::abs(src[i]));
410 }
411 return result;
412 #else
413 (void)_size;
414 (void)srcBase;
415 (void)srcStride;
416
417 return 0;
418 #endif
419 }
420
normL1(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)421 s32 normL1(const Size2D &_size,
422 const u16 * srcBase, ptrdiff_t srcStride)
423 {
424 internal::assertSupportedConfiguration();
425 #ifdef CAROTENE_NEON
426 Size2D size(_size);
427 if (srcStride == (ptrdiff_t)(size.width))
428 {
429 size.width *= size.height;
430 size.height = 1;
431 }
432 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
433 s32 result = 0;
434 for(size_t k = 0; k < size.height; ++k)
435 {
436 const u16* src = internal::getRowPtr( srcBase, srcStride, k);
437 size_t i = 0;
438 uint32x4_t vs = vmovq_n_u32(0);
439 for (; i < roiw4; i += 4)
440 {
441 internal::prefetch(src + i);
442 uint16x4_t s = vld1_u16(src + i);
443 vs = vaddw_u16(vs, s);
444 }
445 u32 s2[4];
446 vst1q_u32(s2, vs);
447 for (u32 j = 0; j < 4; j++)
448 result += s2[j];
449 for ( ; i < size.width; i++)
450 result += (s32)(src[i]);
451 }
452 return result;
453 #else
454 (void)_size;
455 (void)srcBase;
456 (void)srcStride;
457
458 return 0;
459 #endif
460 }
461
normL1(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)462 s32 normL1(const Size2D &_size,
463 const s16 * srcBase, ptrdiff_t srcStride)
464 {
465 internal::assertSupportedConfiguration();
466 #ifdef CAROTENE_NEON
467 Size2D size(_size);
468 if (srcStride == (ptrdiff_t)(size.width))
469 {
470 size.width *= size.height;
471 size.height = 1;
472 }
473 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
474 s32 result = 0;
475 for(size_t k = 0; k < size.height; ++k)
476 {
477 const s16* src = internal::getRowPtr( srcBase, srcStride, k);
478 size_t i = 0;
479 uint32x4_t vs = vmovq_n_u32(0);
480 for (; i < roiw4; i += 4)
481 {
482 internal::prefetch(src + i);
483 uint16x4_t s = vreinterpret_u16_s16(vabs_s16(vld1_s16(src + i)));
484 vs = vaddw_u16(vs, s);
485 }
486 u32 s2[4];
487 vst1q_u32(s2, vs);
488 for (u32 j = 0; j < 4; j++)
489 result += s2[j];
490 for ( ; i < size.width; i++)
491 result += (s32)(std::abs(src[i]));
492 }
493 return result;
494 #else
495 (void)_size;
496 (void)srcBase;
497 (void)srcStride;
498
499 return 0;
500 #endif
501 }
502
normL1(const Size2D & _size,const s32 * srcBase,ptrdiff_t srcStride)503 f64 normL1(const Size2D &_size,
504 const s32 * srcBase, ptrdiff_t srcStride)
505 {
506 internal::assertSupportedConfiguration();
507 #ifdef CAROTENE_NEON
508 Size2D size(_size);
509 if (srcStride == (ptrdiff_t)(size.width))
510 {
511 size.width *= size.height;
512 size.height = 1;
513 }
514 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
515 f64 result = 0;
516 for(size_t k = 0; k < size.height; ++k)
517 {
518 const s32* src = internal::getRowPtr( srcBase, srcStride, k);
519 size_t i = 0;
520 for (; i < roiw4;)
521 {
522 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
523 float32x4_t s = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i)));
524 for (i += 4; i <= limit; i += 4 )
525 {
526 internal::prefetch(src + i);
527 float32x4_t s1 = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i)));
528 s = vaddq_f32(s, s1);
529 }
530
531 f32 s2[4];
532 vst1q_f32(s2, s);
533
534 for (u32 j = 0; j < 4; j++)
535 result += (f64)(s2[j]);
536 }
537 for ( ; i < size.width; i++)
538 result += (f64)(std::abs(src[i]));
539 }
540 return result;
541 #else
542 (void)_size;
543 (void)srcBase;
544 (void)srcStride;
545
546 return 0.;
547 #endif
548 }
549
normL1(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)550 f64 normL1(const Size2D &_size,
551 const f32 * srcBase, ptrdiff_t srcStride)
552 {
553 internal::assertSupportedConfiguration();
554 #ifdef CAROTENE_NEON
555 Size2D size(_size);
556 if (srcStride == (ptrdiff_t)(size.width))
557 {
558 size.width *= size.height;
559 size.height = 1;
560 }
561 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
562 f64 result = 0;
563 for(size_t k = 0; k < size.height; ++k)
564 {
565 const f32* src = internal::getRowPtr( srcBase, srcStride, k);
566 size_t i = 0;
567
568 for (; i < roiw4;)
569 {
570 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
571 float32x4_t s = vabsq_f32(vld1q_f32(src + i));
572 for (i += 4; i <= limit; i += 4)
573 {
574 internal::prefetch(src + i);
575 float32x4_t s1 = vld1q_f32(src + i);
576 float32x4_t sa = vabsq_f32(s1);
577 s = vaddq_f32(sa, s);
578 }
579
580 f32 s2[4];
581 vst1q_f32(s2, s);
582
583 for (u32 j = 0; j < 4; j++)
584 result += (f64)(s2[j]);
585 }
586 for (; i < size.width; i++)
587 result += std::abs((f64)(src[i]));
588 }
589 return result;
590 #else
591 (void)_size;
592 (void)srcBase;
593 (void)srcStride;
594
595 return 0.;
596 #endif
597 }
598
normL2(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)599 s32 normL2(const Size2D &_size,
600 const u8 * srcBase, ptrdiff_t srcStride)
601 {
602 internal::assertSupportedConfiguration();
603 #ifdef CAROTENE_NEON
604 Size2D size(_size);
605 if (srcStride == (ptrdiff_t)(size.width))
606 {
607 size.width *= size.height;
608 size.height = 1;
609 }
610 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
611 s32 result = 0;
612 for(size_t k = 0; k < size.height; ++k)
613 {
614 const u8* src = internal::getRowPtr( srcBase, srcStride, k);
615 size_t i = 0;
616
617 uint32x4_t sl = vmovq_n_u32(0);
618 uint32x4_t sh = vmovq_n_u32(0);
619
620 for (; i < roiw8; i += 8)
621 {
622 internal::prefetch(src + i);
623 uint8x8_t s1 = vld1_u8(src + i);
624 uint16x8_t sq = vmull_u8(s1, s1);
625
626 sl = vaddw_u16(sl, vget_low_u16(sq));
627 sh = vaddw_u16(sh, vget_high_u16(sq));
628 }
629
630 uint32x4_t s = vaddq_u32(sl, sh);
631 uint32x2_t ss = vadd_u32(vget_low_u32(s), vget_high_u32(s));
632
633 u32 s2[2];
634 vst1_u32(s2, ss);
635
636 result += (s32)(s2[0] + s2[1]);
637
638 for (; i < size.width; i++)
639 result += (s32)(src[i]) * (s32)(src[i]);
640 }
641 return result;
642 #else
643 (void)_size;
644 (void)srcBase;
645 (void)srcStride;
646
647 return 0;
648 #endif
649 }
650
normL2(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)651 s32 normL2(const Size2D &_size,
652 const s8 * srcBase, ptrdiff_t srcStride)
653 {
654 internal::assertSupportedConfiguration();
655 #ifdef CAROTENE_NEON
656 Size2D size(_size);
657 if (srcStride == (ptrdiff_t)(size.width))
658 {
659 size.width *= size.height;
660 size.height = 1;
661 }
662 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
663 s32 result = 0;
664 for(size_t k = 0; k < size.height; ++k)
665 {
666 const s8* src = internal::getRowPtr( srcBase, srcStride, k);
667 size_t i = 0;
668
669 int32x4_t sl = vmovq_n_s32(0);
670 int32x4_t sh = vmovq_n_s32(0);
671
672 for (; i < roiw8; i += 8)
673 {
674 internal::prefetch(src + i);
675 int8x8_t s1 = vld1_s8(src + i);
676 int16x8_t sq = vmull_s8(s1, s1);
677
678 sl = vaddw_s16(sl, vget_low_s16(sq));
679 sh = vaddw_s16(sh, vget_high_s16(sq));
680 }
681
682 int32x4_t s = vaddq_s32(sl, sh);
683 int32x2_t ss = vadd_s32(vget_low_s32(s), vget_high_s32(s));
684
685 s32 s2[2];
686 vst1_s32(s2, ss);
687
688 result += s2[0] + s2[1];
689
690 for (; i < size.width; i++)
691 result += (s32)(src[i]) * (s32)(src[i]);
692 }
693 return result;
694 #else
695 (void)_size;
696 (void)srcBase;
697 (void)srcStride;
698
699 return 0;
700 #endif
701 }
702
normL2(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)703 f64 normL2(const Size2D &_size,
704 const u16 * srcBase, ptrdiff_t srcStride)
705 {
706 internal::assertSupportedConfiguration();
707 #ifdef CAROTENE_NEON
708 Size2D size(_size);
709 if (srcStride == (ptrdiff_t)(size.width))
710 {
711 size.width *= size.height;
712 size.height = 1;
713 }
714 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
715 f64 result = 0;
716 for(size_t k = 0; k < size.height; ++k)
717 {
718 const u16* src = internal::getRowPtr( srcBase, srcStride, k);
719 size_t i = 0;
720 for (; i < roiw4;)
721 {
722 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
723 uint16x4_t s0 = vld1_u16(src+i);
724 float32x4_t s = vcvtq_f32_u32(vmull_u16(s0,s0));
725 for (i += 4; i <= limit; i += 4 )
726 {
727 internal::prefetch(src + i);
728 uint16x4_t s1 = vld1_u16(src+i);
729 float32x4_t sq = vcvtq_f32_u32(vmull_u16(s1, s1));
730 s = vaddq_f32(s, sq);
731 }
732 f32 s2[4];
733 vst1q_f32(s2, s);
734 for (u32 j = 0; j < 4; j++)
735 result += (f64)(s2[j]);
736 }
737
738 for ( ; i < size.width; i++)
739 result += (f64)(src[i]) * (f64)(src[i]);
740 }
741 return result;
742 #else
743 (void)_size;
744 (void)srcBase;
745 (void)srcStride;
746
747 return 0.;
748 #endif
749 }
750
normL2(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)751 f64 normL2(const Size2D &_size,
752 const s16 * srcBase, ptrdiff_t srcStride)
753 {
754 internal::assertSupportedConfiguration();
755 #ifdef CAROTENE_NEON
756 Size2D size(_size);
757 if (srcStride == (ptrdiff_t)(size.width))
758 {
759 size.width *= size.height;
760 size.height = 1;
761 }
762 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
763 f64 result = 0;
764 for(size_t k = 0; k < size.height; ++k)
765 {
766 const s16* src = internal::getRowPtr( srcBase, srcStride, k);
767 size_t i = 0;
768 for (; i < roiw4;)
769 {
770 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
771 int16x4_t s0 = vld1_s16(src+i);
772 float32x4_t s = vcvtq_f32_s32(vmull_s16(s0,s0));
773 for (i += 4; i <= limit; i += 4 )
774 {
775 internal::prefetch(src + i);
776 int16x4_t s1 = vld1_s16(src+i);
777 float32x4_t sq = vcvtq_f32_s32(vmull_s16(s1, s1));
778 s = vaddq_f32(s, sq);
779 }
780 f32 s2[4];
781 vst1q_f32(s2, s);
782 for (u32 j = 0; j < 4; j++)
783 result += (f64)(s2[j]);
784 }
785
786 for ( ; i < size.width; i++)
787 result += (f64)(src[i]) * (f64)(src[i]);
788 }
789 return result;
790 #else
791 (void)_size;
792 (void)srcBase;
793 (void)srcStride;
794
795 return 0.;
796 #endif
797 }
798
normL2(const Size2D & _size,const s32 * srcBase,ptrdiff_t srcStride)799 f64 normL2(const Size2D &_size,
800 const s32 * srcBase, ptrdiff_t srcStride)
801 {
802 internal::assertSupportedConfiguration();
803 #ifdef CAROTENE_NEON
804 Size2D size(_size);
805 if (srcStride == (ptrdiff_t)(size.width))
806 {
807 size.width *= size.height;
808 size.height = 1;
809 }
810 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
811 f64 result = 0;
812 for(size_t k = 0; k < size.height; ++k)
813 {
814 const s32* src = internal::getRowPtr( srcBase, srcStride, k);
815 size_t i = 0;
816 for (; i < roiw4;)
817 {
818 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
819 float32x4_t s = vcvtq_f32_s32(vld1q_s32(src + i));
820 s = vmulq_f32(s, s);
821 for (i += 4; i <= limit; i += 4 )
822 {
823 internal::prefetch(src + i);
824 float32x4_t s1 = vcvtq_f32_s32(vld1q_s32(src + i));
825 s = vmlaq_f32(s, s1, s1);
826 }
827
828 f32 s2[4];
829 vst1q_f32(s2, s);
830
831 for (u32 j = 0; j < 4; j++)
832 result += (f64)(s2[j]);
833 }
834 for ( ; i < size.width; i++)
835 result += (f64)(src[i]) * (f64)(src[i]);
836 }
837 return result;
838 #else
839 (void)_size;
840 (void)srcBase;
841 (void)srcStride;
842
843 return 0.;
844 #endif
845 }
846
normL2(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)847 f64 normL2(const Size2D &_size,
848 const f32 * srcBase, ptrdiff_t srcStride)
849 {
850 internal::assertSupportedConfiguration();
851 #ifdef CAROTENE_NEON
852 Size2D size(_size);
853 if (srcStride == (ptrdiff_t)(size.width))
854 {
855 size.width *= size.height;
856 size.height = 1;
857 }
858 size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
859 f64 result = 0;
860 for(size_t k = 0; k < size.height; ++k)
861 {
862 const f32* src = internal::getRowPtr( srcBase, srcStride, k);
863 size_t i = 0;
864 for (; i < roiw4;)
865 {
866 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
867 float32x4_t s = vld1q_f32(src + i);
868 s = vmulq_f32(s, s);
869 for (i += 4; i <= limit; i += 4 )
870 {
871 internal::prefetch(src + i);
872 float32x4_t s1 = vld1q_f32(src + i);
873 s = vmlaq_f32(s, s1, s1);
874 }
875
876 f32 s2[4];
877 vst1q_f32(s2, s);
878
879 for (u32 j = 0; j < 4; j++)
880 result += (f64)(s2[j]);
881 }
882 for ( ; i < size.width; i++)
883 result += (f64)(src[i]) * (f64)(src[i]);
884 }
885 return result;
886 #else
887 (void)_size;
888 (void)srcBase;
889 (void)srcStride;
890
891 return 0.;
892 #endif
893 }
894
diffNormInf(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)895 s32 diffNormInf(const Size2D &_size,
896 const u8 * src0Base, ptrdiff_t src0Stride,
897 const u8 * src1Base, ptrdiff_t src1Stride)
898 {
899 internal::assertSupportedConfiguration();
900 #ifdef CAROTENE_NEON
901 Size2D size(_size);
902 if (src0Stride == src1Stride &&
903 src0Stride == (ptrdiff_t)(size.width))
904 {
905 size.width *= size.height;
906 size.height = 1;
907 }
908 s32 result = 0;
909 for(size_t k = 0; k < size.height; ++k)
910 {
911 const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k);
912 const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k);
913 size_t i = 0;
914
915 if (size.width >= 16)
916 {
917 uint8x16_t vs3 = vdupq_n_u8(0);
918 for (; i < size.width - 16; i += 16)
919 {
920 internal::prefetch(src1 + i);
921 internal::prefetch(src2 + i);
922
923 uint8x16_t vs1 = vld1q_u8(src1 + i);
924 uint8x16_t vs2 = vld1q_u8(src2 + i);
925
926 vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2));
927 }
928
929 u8 s2[8];
930 vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3)));
931
932 for (u32 j = 0; j < 8; j++)
933 result = std::max((s32)(s2[j]), result);
934 }
935
936 for (; i < size.width; i++)
937 {
938 result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result);
939 }
940 }
941 return result;
942 #else
943 (void)_size;
944 (void)src0Base;
945 (void)src0Stride;
946 (void)src1Base;
947 (void)src1Stride;
948
949 return 0;
950 #endif
951 }
952
/*
 * Infinity norm of the difference of two f32 2D arrays: the largest
 * absolute element-wise difference.
 *
 *   _size      - width and height, both counted in elements
 *   src0Base   - pointer to the first row of the first array
 *   src0Stride - row stride of the first array, forwarded to internal::getRowPtr
 *   src1Base   - pointer to the first row of the second array
 *   src1Stride - row stride of the second array, forwarded to internal::getRowPtr
 *
 * Returns 0 when CAROTENE_NEON is not defined.
 */
f32 diffNormInf(const Size2D &_size,
                const f32 * src0Base, ptrdiff_t src0Stride,
                const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#else
    Size2D dims(_size);
    // When both strides equal the width the data is walked as a single row.
    // NOTE(review): the strides are compared against an element count here;
    // confirm they are expressed in the same unit for 4-byte elements.
    if (src0Stride == src1Stride &&
        src0Stride == static_cast<ptrdiff_t>(dims.width))
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

    f32 maxDiff = 0;
    for (size_t row = 0; row < dims.height; ++row)
    {
        const f32 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (dims.width >= 4)
        {
            // Per-lane running maximum of |a - b| over 4-element chunks.
            float32x4_t vmax = vabdq_f32(vld1q_f32(lineA), vld1q_f32(lineB));

            x += 4;
            while (x <= dims.width - 4)
            {
                internal::prefetch(lineA + x);
                internal::prefetch(lineB + x);

                const float32x4_t va = vld1q_f32(lineA + x);
                const float32x4_t vb = vld1q_f32(lineB + x);

                vmax = vmaxq_f32(vmax, vabdq_f32(vb, va));
                x += 4;
            }

            // Finish the reduction over the four lanes in scalar code.
            f32 lanes[4];
            vst1q_f32(lanes, vmax);

            for (u32 lane = 0; lane < 4; ++lane)
            {
                if (lanes[lane] > maxDiff)
                    maxDiff = lanes[lane];
            }
        }

        // Elements that did not fill a whole vector.
        for (; x < dims.width; ++x)
        {
            const f32 absDiff = std::abs(lineA[x] - lineB[x]);
            if (absDiff > maxDiff)
                maxDiff = absDiff;
        }
    }
    return maxDiff;
#endif
}
1015
diffNormL1(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)1016 s32 diffNormL1(const Size2D &_size,
1017 const u8 * src0Base, ptrdiff_t src0Stride,
1018 const u8 * src1Base, ptrdiff_t src1Stride)
1019 {
1020 internal::assertSupportedConfiguration();
1021 #ifdef CAROTENE_NEON
1022 Size2D size(_size);
1023 if (src0Stride == src1Stride &&
1024 src0Stride == (ptrdiff_t)(size.width))
1025 {
1026 size.width *= size.height;
1027 size.height = 1;
1028 }
1029 s32 result = 0;
1030 for(size_t k = 0; k < size.height; ++k)
1031 {
1032 const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k);
1033 const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k);
1034 size_t i = 0;
1035
1036 if (size.width >= 16)
1037 {
1038 for(; i <= size.width - 16;)
1039 {
1040 size_t limit = std::min(size.width, i + 2*256) - 16;
1041 uint16x8_t si1 = vmovq_n_u16(0);
1042 uint16x8_t si2 = vmovq_n_u16(0);
1043
1044 for (; i <= limit; i += 16)
1045 {
1046 internal::prefetch(src1 + i);
1047 internal::prefetch(src2 + i);
1048
1049 uint8x16_t vs1 = vld1q_u8(src1 + i);
1050 uint8x16_t vs2 = vld1q_u8(src2 + i);
1051
1052 si1 = vabal_u8(si1, vget_low_u8(vs1), vget_low_u8(vs2));
1053 si2 = vabal_u8(si2, vget_high_u8(vs1), vget_high_u8(vs2));
1054 }
1055
1056 u32 s2[4];
1057 vst1q_u32(s2, vaddq_u32(vpaddlq_u16(si1), vpaddlq_u16(si2)));
1058
1059 for (u32 j = 0; j < 4; j++)
1060 {
1061 if ((s32)(0x7fFFffFFu - s2[j]) <= result)
1062 {
1063 return 0x7fFFffFF; //result already saturated
1064 }
1065 result = (s32)((u32)(result) + s2[j]);
1066 }
1067 }
1068
1069 }
1070
1071 for (; i < size.width; i++)
1072 {
1073 u32 v = std::abs((s32)(src1[i]) - (s32)(src2[i]));
1074
1075 if ((s32)(0x7fFFffFFu - v) <= result)
1076 {
1077 return 0x7fFFffFF; //result already saturated
1078 }
1079 result = (s32)((u32)(result) + v);
1080 }
1081 }
1082 return result;
1083 #else
1084 (void)_size;
1085 (void)src0Base;
1086 (void)src0Stride;
1087 (void)src1Base;
1088 (void)src1Stride;
1089
1090 return 0;
1091 #endif
1092 }
1093
diffNormL1(const Size2D & _size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride)1094 f64 diffNormL1(const Size2D &_size,
1095 const f32 * src0Base, ptrdiff_t src0Stride,
1096 const f32 * src1Base, ptrdiff_t src1Stride)
1097 {
1098 internal::assertSupportedConfiguration();
1099 #ifdef CAROTENE_NEON
1100 Size2D size(_size);
1101 if (src0Stride == src1Stride &&
1102 src0Stride == (ptrdiff_t)(size.width))
1103 {
1104 size.width *= size.height;
1105 size.height = 1;
1106 }
1107 f64 result = 0;
1108 for(size_t k = 0; k < size.height; ++k)
1109 {
1110 const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k);
1111 const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k);
1112 size_t i = 0;
1113
1114 if (size.width >= 4)
1115 {
1116 for(; i <= size.width - 4;)
1117 {
1118 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
1119 float32x4_t s = vmovq_n_f32(0.0f);
1120
1121 for (; i <= limit; i += 4 )
1122 {
1123 internal::prefetch(src1 + i);
1124 internal::prefetch(src2 + i);
1125
1126 float32x4_t vs1 = vld1q_f32(src1 + i);
1127 float32x4_t vs2 = vld1q_f32(src2 + i);
1128
1129 float32x4_t vd = vabdq_f32(vs2, vs1);
1130 s = vaddq_f32(s, vd);
1131 }
1132
1133 f32 s2[4];
1134 vst1q_f32(s2, s);
1135
1136 for (u32 j = 0; j < 4; j++)
1137 result += (f64)(s2[j]);
1138 }
1139 }
1140
1141 for (; i < size.width; i++)
1142 {
1143 f32 v = std::abs(src1[i] - src2[i]);
1144 result += (f64)(v);
1145 }
1146 }
1147 return result;
1148 #else
1149 (void)_size;
1150 (void)src0Base;
1151 (void)src0Stride;
1152 (void)src1Base;
1153 (void)src1Stride;
1154
1155 return 0.;
1156 #endif
1157 }
1158
diffNormL2(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)1159 s32 diffNormL2(const Size2D &_size,
1160 const u8 * src0Base, ptrdiff_t src0Stride,
1161 const u8 * src1Base, ptrdiff_t src1Stride)
1162 {
1163 internal::assertSupportedConfiguration();
1164 #ifdef CAROTENE_NEON
1165 Size2D size(_size);
1166 if (src0Stride == src1Stride &&
1167 src0Stride == (ptrdiff_t)(size.width))
1168 {
1169 size.width *= size.height;
1170 size.height = 1;
1171 }
1172 s32 result = 0;
1173 for(size_t k = 0; k < size.height; ++k)
1174 {
1175 const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k);
1176 const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k);
1177 size_t i = 0;
1178
1179 #define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow
1180 if (size.width >= 16)
1181 {
1182 for(; i <= size.width - 16;)
1183 {
1184 size_t limit = std::min(size.width, i + NORML28U_BLOCK_SIZE) - 16;
1185 uint32x4_t si1 = vmovq_n_u32(0);
1186 uint32x4_t si2 = vmovq_n_u32(0);
1187
1188 for (; i <= limit; i += 16)
1189 {
1190 internal::prefetch(src1 + i);
1191 internal::prefetch(src2 + i);
1192
1193 uint8x16_t vs1 = vld1q_u8(src1 + i);
1194 uint8x16_t vs2 = vld1q_u8(src2 + i);
1195
1196 uint16x8_t vdlo = vabdl_u8(vget_low_u8(vs1), vget_low_u8(vs2));
1197 uint16x8_t vdhi = vabdl_u8(vget_high_u8(vs1), vget_high_u8(vs2));
1198
1199 si1 = vmlal_u16(si1, vget_low_u16(vdlo), vget_low_u16(vdlo));
1200 si2 = vmlal_u16(si2, vget_high_u16(vdlo), vget_high_u16(vdlo));
1201
1202 si1 = vmlal_u16(si1, vget_low_u16(vdhi), vget_low_u16(vdhi));
1203 si2 = vmlal_u16(si2, vget_high_u16(vdhi), vget_high_u16(vdhi));
1204 }
1205
1206 u32 s2[4];
1207 vst1q_u32(s2, vqaddq_u32(si1, si2));
1208
1209 for (u32 j = 0; j < 4; j++)
1210 {
1211 if ((s32)(0x7fFFffFFu - s2[j]) <= result)
1212 {
1213 return 0x7fFFffFF; //result already saturated
1214 }
1215 result += (s32)s2[j];
1216 }
1217 }
1218
1219 }
1220
1221 for (; i < size.width; i++)
1222 {
1223 s32 v = (s32)(src1[i]) - (s32)(src2[i]);
1224 v *= v;
1225
1226 if ((s32)(0x7fFFffFFu - (u32)(v)) <= result)
1227 {
1228 return 0x7fFFffFF; //result already saturated
1229 }
1230 result += v;
1231 }
1232 }
1233 return result;
1234 #else
1235 (void)_size;
1236 (void)src0Base;
1237 (void)src0Stride;
1238 (void)src1Base;
1239 (void)src1Stride;
1240
1241 return 0;
1242 #endif
1243 }
1244
diffNormL2(const Size2D & _size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride)1245 f64 diffNormL2(const Size2D &_size,
1246 const f32 * src0Base, ptrdiff_t src0Stride,
1247 const f32 * src1Base, ptrdiff_t src1Stride)
1248 {
1249 internal::assertSupportedConfiguration();
1250 #ifdef CAROTENE_NEON
1251 Size2D size(_size);
1252 if (src0Stride == src1Stride &&
1253 src0Stride == (ptrdiff_t)(size.width))
1254 {
1255 size.width *= size.height;
1256 size.height = 1;
1257 }
1258 f64 result = 0;
1259 for(size_t k = 0; k < size.height; ++k)
1260 {
1261 const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k);
1262 const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k);
1263 size_t i = 0;
1264
1265 if (size.width >= 4)
1266 {
1267 for(; i <= size.width - 4;)
1268 {
1269 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
1270 float32x4_t s = vmovq_n_f32(0.0f);
1271
1272 for (; i <= limit; i += 4 )
1273 {
1274 internal::prefetch(src1 + i);
1275 internal::prefetch(src2 + i);
1276
1277 float32x4_t vs1 = vld1q_f32(src1 + i);
1278 float32x4_t vs2 = vld1q_f32(src2 + i);
1279
1280 float32x4_t vd = vsubq_f32(vs2,vs1);
1281 s = vmlaq_f32(s, vd, vd);
1282 }
1283
1284 f32 s2[4];
1285 vst1q_f32(s2, s);
1286
1287 for (u32 j = 0; j < 4; j++)
1288 result += (f64)(s2[j]);
1289 }
1290 }
1291
1292 for (; i < size.width; i++)
1293 {
1294 f32 v = src1[i] - src2[i];
1295 result += v * v;
1296 }
1297 }
1298 return result;
1299 #else
1300 (void)_size;
1301 (void)src0Base;
1302 (void)src0Stride;
1303 (void)src1Base;
1304 (void)src1Stride;
1305
1306 return 0.;
1307 #endif
1308 }
1309
1310 } // namespace CAROTENE_NS
1311