1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 
42 namespace CAROTENE_NS {
43 
//number of elements accumulated in single precision (f32) before the partial
//sum is added to the f64 total; must be multiple of 4
45 #define NORM32F_BLOCK_SIZE 2048
46 
normInf(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)47 s32 normInf(const Size2D &_size,
48             const u8 * srcBase, ptrdiff_t srcStride)
49 {
50     internal::assertSupportedConfiguration();
51 #ifdef CAROTENE_NEON
52     Size2D size(_size);
53     if (srcStride == (ptrdiff_t)(size.width))
54     {
55         size.width *= size.height;
56         size.height = 1;
57     }
58     s32 result = 0;
59     for(size_t k = 0; k < size.height; ++k)
60     {
61         const u8* src = internal::getRowPtr( srcBase,  srcStride, k);
62         size_t i = 0;
63         if (size.width >= 16)
64         {
65             uint8x16_t s = vld1q_u8(src);
66             for (i = 16; i <= size.width - 16; i += 16)
67             {
68                 internal::prefetch(src + i);
69                 uint8x16_t s1 = vld1q_u8(src + i);
70                 s = vmaxq_u8(s1, s);
71             }
72             u8 s2[8];
73             uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s));
74             vst1_u8(s2, s3);
75             for (u32 j = 0; j < 8; j++)
76                 result = std::max((s32)(s2[j]), result);
77         }
78         for ( ; i < size.width; i++)
79             result = std::max((s32)(src[i]), result);
80     }
81     return result;
82 #else
83     (void)_size;
84     (void)srcBase;
85     (void)srcStride;
86 
87     return 0;
88 #endif
89 }
90 
normInf(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)91 s32 normInf(const Size2D &_size,
92             const s8 * srcBase, ptrdiff_t srcStride)
93 {
94     internal::assertSupportedConfiguration();
95 #ifdef CAROTENE_NEON
96     Size2D size(_size);
97     if (srcStride == (ptrdiff_t)(size.width))
98     {
99         size.width *= size.height;
100         size.height = 1;
101     }
102     s32 result = 0;
103     for(size_t k = 0; k < size.height; ++k)
104     {
105         const s8* src = internal::getRowPtr( srcBase,  srcStride, k);
106         size_t i = 0;
107         if (size.width >= 16)
108         {
109             uint8x16_t s = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src)));
110             for (i = 16; i <= size.width - 16; i += 16)
111             {
112                 internal::prefetch(src + i);
113                 uint8x16_t s1 = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src + i)));
114                 s = vmaxq_u8(s1, s);
115             }
116             u8 s2[8];
117             uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s));
118             vst1_u8(s2, s3);
119             for (u32 j = 0; j < 8; j++)
120                 result = std::max((s32)(s2[j]), result);
121         }
122         for ( ; i < size.width; i++)
123             result = std::max((s32)(std::abs(src[i])), result);
124     }
125     return result;
126 #else
127     (void)_size;
128     (void)srcBase;
129     (void)srcStride;
130 
131     return 0;
132 #endif
133 }
134 
normInf(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)135 s32 normInf(const Size2D &_size,
136             const u16 * srcBase, ptrdiff_t srcStride)
137 {
138     internal::assertSupportedConfiguration();
139 #ifdef CAROTENE_NEON
140     Size2D size(_size);
141     if (srcStride == (ptrdiff_t)(size.width))
142     {
143         size.width *= size.height;
144         size.height = 1;
145     }
146     s32 result = 0;
147     for(size_t k = 0; k < size.height; ++k)
148     {
149         const u16* src = internal::getRowPtr( srcBase,  srcStride, k);
150         size_t i = 0;
151         if (size.width >= 8)
152         {
153             uint16x8_t s = vld1q_u16(src);
154             for (i = 8; i <= size.width - 8; i += 8)
155             {
156                 internal::prefetch(src + i);
157                 uint16x8_t s1 = vld1q_u16(src + i);
158                 s = vmaxq_u16(s1, s);
159             }
160             u16 s2[4];
161             uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s));
162             vst1_u16(s2, s3);
163             for (u32 j = 0; j < 4; j++)
164                 result = std::max((s32)(s2[j]), result);
165         }
166         for ( ; i < size.width; i++)
167             result = std::max((s32)(src[i]), result);
168     }
169     return result;
170 #else
171     (void)_size;
172     (void)srcBase;
173     (void)srcStride;
174 
175     return 0;
176 #endif
177 }
178 
normInf(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)179 s32 normInf(const Size2D &_size,
180             const s16 * srcBase, ptrdiff_t srcStride)
181 {
182     internal::assertSupportedConfiguration();
183 #ifdef CAROTENE_NEON
184     Size2D size(_size);
185     if (srcStride == (ptrdiff_t)(size.width))
186     {
187         size.width *= size.height;
188         size.height = 1;
189     }
190     s32 result = 0;
191     for(size_t k = 0; k < size.height; ++k)
192     {
193         const s16* src = internal::getRowPtr( srcBase,  srcStride, k);
194         size_t i = 0;
195         if (size.width >= 8)
196         {
197             uint16x8_t s = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src)));
198             for (i = 8; i <= size.width - 8; i += 8)
199             {
200                 internal::prefetch(src + i);
201                 uint16x8_t s1 = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src + i)));
202                 s = vmaxq_u16(s1, s);
203             }
204             u16 s2[4];
205             uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s));
206             vst1_u16(s2, s3);
207             for (u32 j = 0; j < 4; j++)
208                 result = std::max((s32)(s2[j]), result);
209         }
210         for ( ; i < size.width; i++)
211             result = std::max(std::abs((s32)(src[i])), result);
212     }
213     return result;
214 #else
215     (void)_size;
216     (void)srcBase;
217     (void)srcStride;
218 
219     return 0;
220 #endif
221 }
222 
/*
 * Infinity norm (largest absolute value) of a signed 32-bit image.
 *
 * _size     - image width and height, in elements
 * srcBase   - pointer to the first row
 * srcStride - row stride forwarded to internal::getRowPtr
 *
 * Returns the largest magnitude found; 0 when the image is empty or when the
 * library is built without CAROTENE_NEON (arguments are then ignored).
 *
 * The magnitude of the most negative s32 value does not fit in the s32 return
 * type, so it is reported saturated as 0x7fffffff. A plain (wrapping)
 * absolute value would leave that element negative, which corrupts the
 * maximum for its whole vector lane, and std::abs on it is undefined.
 *
 * NOTE(review): the single-row shortcut compares srcStride with the width in
 * elements, not bytes - confirm which unit internal::getRowPtr expects.
 */
s32 normInf(const Size2D &_size,
            const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr( srcBase,  srcStride, k);
        size_t i = 0;
        if (size.width >= 4)
        {
            // Saturating absolute value keeps every lane in [0, 0x7fffffff],
            // so a signed maximum is sufficient.
            int32x4_t s = vqabsq_s32(vld1q_s32(src));
            for (i = 4; i <= size.width - 4; i += 4)
            {
                internal::prefetch(src + i);
                int32x4_t s1 = vqabsq_s32(vld1q_s32(src + i));
                s = vmaxq_s32(s1, s);
            }
            // Fold 4 lanes to 2 and finish the reduction in scalar code.
            s32 s2[2];
            int32x2_t s3 = vmax_s32(vget_low_s32(s), vget_high_s32(s));
            vst1_s32(s2, s3);
            for (u32 j = 0; j < 2; j++)
                result = std::max(s2[j], result);
        }
        for ( ; i < size.width; i++)
        {
            // Negate in unsigned arithmetic so the most negative value does
            // not overflow, then clamp to the largest representable s32.
            const s32 v = src[i];
            const u32 mag = v < 0 ? 0u - static_cast<u32>(v) : static_cast<u32>(v);
            const s32 a = static_cast<s32>(mag > 0x7fffffffu ? 0x7fffffffu : mag);
            result = std::max(a, result);
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
266 
/*
 * Infinity norm (largest absolute value) of a 32-bit float image.
 *
 * _size     - image width and height, in elements
 * srcBase   - pointer to the first row
 * srcStride - row stride forwarded to internal::getRowPtr
 *
 * Returns the largest magnitude found; 0 when the image is empty or when the
 * library is built without CAROTENE_NEON (arguments are then ignored).
 *
 * NOTE(review): the single-row shortcut compares srcStride with the width in
 * elements, not bytes - confirm which unit internal::getRowPtr expects.
 */
f32 normInf(const Size2D &_size,
            const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D dims(_size);
    if (static_cast<ptrdiff_t>(dims.width) == srcStride)
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

    f32 maxVal = 0;
    for (size_t row = 0; row != dims.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        if (dims.width >= 4)
        {
            // Per-lane running maximum of |value| over every full 4-element chunk.
            float32x4_t vmaxAcc = vabsq_f32(vld1q_f32(line));
            x = 4;
            while (x <= dims.width - 4)
            {
                internal::prefetch(line + x);
                const float32x4_t mag = vabsq_f32(vld1q_f32(line + x));
                vmaxAcc = vmaxq_f32(mag, vmaxAcc);
                x += 4;
            }

            // Fold 4 lanes to 2 and finish the reduction in scalar code.
            f32 lanes[2];
            vst1_f32(lanes, vmax_f32(vget_low_f32(vmaxAcc), vget_high_f32(vmaxAcc)));
            for (u32 lane = 0; lane != 2; ++lane)
                maxVal = std::max(lanes[lane], maxVal);
        }

        // Elements that do not fill a whole vector.
        while (x < dims.width)
        {
            maxVal = std::max(std::abs(line[x]), maxVal);
            ++x;
        }
    }
    return maxVal;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
311 
normL1(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)312 s32 normL1(const Size2D &_size,
313            const u8 * srcBase, ptrdiff_t srcStride)
314 {
315     internal::assertSupportedConfiguration();
316 #ifdef CAROTENE_NEON
317     Size2D size(_size);
318     if (srcStride == (ptrdiff_t)(size.width))
319     {
320         size.width *= size.height;
321         size.height = 1;
322     }
323     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324     s32 result = 0;
325     for(size_t k = 0; k < size.height; ++k)
326     {
327         const u8* src = internal::getRowPtr( srcBase,  srcStride, k);
328         size_t i = 0;
329         uint32x4_t vs = vmovq_n_u32(0);
330         for (; i < roiw8;)
331         {
332             size_t limit = std::min(size.width, i + 256) - 8;
333             uint8x8_t s0 = vld1_u8(src + i);
334             uint16x8_t s = vmovl_u8(s0);
335 
336             for (i += 8; i <= limit; i += 8)
337             {
338                 internal::prefetch(src + i);
339                 uint8x8_t s1 = vld1_u8(src + i);
340                 s = vaddw_u8(s, s1);
341             }
342 
343             uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s));
344             vs = vaddw_u16(vs, s4);
345         }
346 
347         u32 s2[2];
348         uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs));
349         vst1_u32(s2, vs2);
350 
351         result += (s32)(s2[0] + s2[1]);
352 
353         for ( ; i < size.width; i++)
354             result += (s32)(src[i]);
355     }
356     return result;
357 #else
358     (void)_size;
359     (void)srcBase;
360     (void)srcStride;
361 
362     return 0;
363 #endif
364 }
365 
normL1(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)366 s32 normL1(const Size2D &_size,
367            const s8 * srcBase, ptrdiff_t srcStride)
368 {
369     internal::assertSupportedConfiguration();
370 #ifdef CAROTENE_NEON
371     Size2D size(_size);
372     if (srcStride == (ptrdiff_t)(size.width))
373     {
374         size.width *= size.height;
375         size.height = 1;
376     }
377     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
378     s32 result = 0;
379     for(size_t k = 0; k < size.height; ++k)
380     {
381         const s8* src = internal::getRowPtr( srcBase,  srcStride, k);
382         size_t i = 0;
383         uint32x4_t vs = vmovq_n_u32(0);
384 
385         for (; i < roiw8;)
386         {
387             size_t limit = std::min(size.width, i + 256) - 8;
388             uint8x8_t s0 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i)));
389             uint16x8_t s = vmovl_u8(s0);
390 
391             for (i += 8; i <= limit; i += 8)
392             {
393                 internal::prefetch(src + i);
394                 uint8x8_t s1 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i)));
395                 s = vaddw_u8(s, s1);
396             }
397 
398             uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s));
399             vs = vaddw_u16(vs, s4);
400         }
401 
402         u32 s2[2];
403         uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs));
404         vst1_u32(s2, vs2);
405 
406         result += (s32)(s2[0] + s2[1]);
407 
408         for ( ; i < size.width; i++)
409             result += (s32)(std::abs(src[i]));
410     }
411     return result;
412 #else
413     (void)_size;
414     (void)srcBase;
415     (void)srcStride;
416 
417     return 0;
418 #endif
419 }
420 
normL1(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)421 s32 normL1(const Size2D &_size,
422            const u16 * srcBase, ptrdiff_t srcStride)
423 {
424     internal::assertSupportedConfiguration();
425 #ifdef CAROTENE_NEON
426     Size2D size(_size);
427     if (srcStride == (ptrdiff_t)(size.width))
428     {
429         size.width *= size.height;
430         size.height = 1;
431     }
432     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
433     s32 result = 0;
434     for(size_t k = 0; k < size.height; ++k)
435     {
436         const u16* src = internal::getRowPtr( srcBase,  srcStride, k);
437         size_t i = 0;
438         uint32x4_t vs = vmovq_n_u32(0);
439         for (; i < roiw4; i += 4)
440         {
441             internal::prefetch(src + i);
442             uint16x4_t s = vld1_u16(src + i);
443             vs = vaddw_u16(vs, s);
444         }
445         u32 s2[4];
446         vst1q_u32(s2, vs);
447         for (u32 j = 0; j < 4; j++)
448             result += s2[j];
449         for ( ; i < size.width; i++)
450             result += (s32)(src[i]);
451     }
452     return result;
453 #else
454     (void)_size;
455     (void)srcBase;
456     (void)srcStride;
457 
458     return 0;
459 #endif
460 }
461 
normL1(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)462 s32 normL1(const Size2D &_size,
463            const s16 * srcBase, ptrdiff_t srcStride)
464 {
465     internal::assertSupportedConfiguration();
466 #ifdef CAROTENE_NEON
467     Size2D size(_size);
468     if (srcStride == (ptrdiff_t)(size.width))
469     {
470         size.width *= size.height;
471         size.height = 1;
472     }
473     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
474     s32 result = 0;
475     for(size_t k = 0; k < size.height; ++k)
476     {
477         const s16* src = internal::getRowPtr( srcBase,  srcStride, k);
478         size_t i = 0;
479         uint32x4_t vs = vmovq_n_u32(0);
480         for (; i < roiw4; i += 4)
481         {
482             internal::prefetch(src + i);
483             uint16x4_t s = vreinterpret_u16_s16(vabs_s16(vld1_s16(src + i)));
484             vs = vaddw_u16(vs, s);
485         }
486         u32 s2[4];
487         vst1q_u32(s2, vs);
488         for (u32 j = 0; j < 4; j++)
489             result += s2[j];
490         for ( ; i < size.width; i++)
491             result += (s32)(std::abs(src[i]));
492     }
493     return result;
494 #else
495     (void)_size;
496     (void)srcBase;
497     (void)srcStride;
498 
499     return 0;
500 #endif
501 }
502 
normL1(const Size2D & _size,const s32 * srcBase,ptrdiff_t srcStride)503 f64 normL1(const Size2D &_size,
504            const s32 * srcBase, ptrdiff_t srcStride)
505 {
506     internal::assertSupportedConfiguration();
507 #ifdef CAROTENE_NEON
508     Size2D size(_size);
509     if (srcStride == (ptrdiff_t)(size.width))
510     {
511         size.width *= size.height;
512         size.height = 1;
513     }
514     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
515     f64 result = 0;
516     for(size_t k = 0; k < size.height; ++k)
517     {
518         const s32* src = internal::getRowPtr( srcBase,  srcStride, k);
519         size_t i = 0;
520         for (; i < roiw4;)
521         {
522             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
523             float32x4_t s = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i)));
524             for (i += 4; i <= limit; i += 4 )
525             {
526                 internal::prefetch(src + i);
527                 float32x4_t s1 = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i)));
528                 s = vaddq_f32(s, s1);
529             }
530 
531             f32 s2[4];
532             vst1q_f32(s2, s);
533 
534             for (u32 j = 0; j < 4; j++)
535                 result += (f64)(s2[j]);
536         }
537         for ( ; i < size.width; i++)
538             result += (f64)(std::abs(src[i]));
539     }
540     return result;
541 #else
542     (void)_size;
543     (void)srcBase;
544     (void)srcStride;
545 
546     return 0.;
547 #endif
548 }
549 
normL1(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)550 f64 normL1(const Size2D &_size,
551            const f32 * srcBase, ptrdiff_t srcStride)
552 {
553     internal::assertSupportedConfiguration();
554 #ifdef CAROTENE_NEON
555     Size2D size(_size);
556     if (srcStride == (ptrdiff_t)(size.width))
557     {
558         size.width *= size.height;
559         size.height = 1;
560     }
561     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
562     f64 result = 0;
563     for(size_t k = 0; k < size.height; ++k)
564     {
565         const f32* src = internal::getRowPtr( srcBase,  srcStride, k);
566         size_t i = 0;
567 
568         for (; i < roiw4;)
569         {
570             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
571             float32x4_t s = vabsq_f32(vld1q_f32(src + i));
572             for (i += 4; i <= limit; i += 4)
573             {
574                 internal::prefetch(src + i);
575                 float32x4_t s1 = vld1q_f32(src + i);
576                 float32x4_t sa = vabsq_f32(s1);
577                 s = vaddq_f32(sa, s);
578             }
579 
580             f32 s2[4];
581             vst1q_f32(s2, s);
582 
583             for (u32 j = 0; j < 4; j++)
584                 result += (f64)(s2[j]);
585         }
586         for (; i < size.width; i++)
587             result += std::abs((f64)(src[i]));
588     }
589     return result;
590 #else
591     (void)_size;
592     (void)srcBase;
593     (void)srcStride;
594 
595     return 0.;
596 #endif
597 }
598 
normL2(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)599 s32 normL2(const Size2D &_size,
600            const u8 * srcBase, ptrdiff_t srcStride)
601 {
602     internal::assertSupportedConfiguration();
603 #ifdef CAROTENE_NEON
604     Size2D size(_size);
605     if (srcStride == (ptrdiff_t)(size.width))
606     {
607         size.width *= size.height;
608         size.height = 1;
609     }
610     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
611     s32 result = 0;
612     for(size_t k = 0; k < size.height; ++k)
613     {
614         const u8* src = internal::getRowPtr( srcBase,  srcStride, k);
615         size_t i = 0;
616 
617         uint32x4_t sl = vmovq_n_u32(0);
618         uint32x4_t sh = vmovq_n_u32(0);
619 
620         for (; i < roiw8; i += 8)
621         {
622             internal::prefetch(src + i);
623             uint8x8_t s1 = vld1_u8(src + i);
624             uint16x8_t sq = vmull_u8(s1, s1);
625 
626             sl = vaddw_u16(sl, vget_low_u16(sq));
627             sh = vaddw_u16(sh, vget_high_u16(sq));
628         }
629 
630         uint32x4_t s = vaddq_u32(sl, sh);
631         uint32x2_t ss = vadd_u32(vget_low_u32(s), vget_high_u32(s));
632 
633         u32 s2[2];
634         vst1_u32(s2, ss);
635 
636         result += (s32)(s2[0] + s2[1]);
637 
638         for (; i < size.width; i++)
639             result += (s32)(src[i]) * (s32)(src[i]);
640     }
641     return result;
642 #else
643     (void)_size;
644     (void)srcBase;
645     (void)srcStride;
646 
647     return 0;
648 #endif
649 }
650 
normL2(const Size2D & _size,const s8 * srcBase,ptrdiff_t srcStride)651 s32 normL2(const Size2D &_size,
652            const s8 * srcBase, ptrdiff_t srcStride)
653 {
654     internal::assertSupportedConfiguration();
655 #ifdef CAROTENE_NEON
656     Size2D size(_size);
657     if (srcStride == (ptrdiff_t)(size.width))
658     {
659         size.width *= size.height;
660         size.height = 1;
661     }
662     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
663     s32 result = 0;
664     for(size_t k = 0; k < size.height; ++k)
665     {
666         const s8* src = internal::getRowPtr( srcBase,  srcStride, k);
667         size_t i = 0;
668 
669         int32x4_t sl = vmovq_n_s32(0);
670         int32x4_t sh = vmovq_n_s32(0);
671 
672         for (; i < roiw8; i += 8)
673         {
674             internal::prefetch(src + i);
675             int8x8_t s1 = vld1_s8(src + i);
676             int16x8_t sq = vmull_s8(s1, s1);
677 
678             sl = vaddw_s16(sl, vget_low_s16(sq));
679             sh = vaddw_s16(sh, vget_high_s16(sq));
680         }
681 
682         int32x4_t s = vaddq_s32(sl, sh);
683         int32x2_t ss = vadd_s32(vget_low_s32(s), vget_high_s32(s));
684 
685         s32 s2[2];
686         vst1_s32(s2, ss);
687 
688         result += s2[0] + s2[1];
689 
690         for (; i < size.width; i++)
691             result += (s32)(src[i]) * (s32)(src[i]);
692     }
693     return result;
694 #else
695     (void)_size;
696     (void)srcBase;
697     (void)srcStride;
698 
699     return 0;
700 #endif
701 }
702 
normL2(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)703 f64 normL2(const Size2D &_size,
704            const u16 * srcBase, ptrdiff_t srcStride)
705 {
706     internal::assertSupportedConfiguration();
707 #ifdef CAROTENE_NEON
708     Size2D size(_size);
709     if (srcStride == (ptrdiff_t)(size.width))
710     {
711         size.width *= size.height;
712         size.height = 1;
713     }
714     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
715     f64 result = 0;
716     for(size_t k = 0; k < size.height; ++k)
717     {
718         const u16* src = internal::getRowPtr( srcBase,  srcStride, k);
719         size_t i = 0;
720         for (; i < roiw4;)
721         {
722             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
723             uint16x4_t s0 = vld1_u16(src+i);
724             float32x4_t s = vcvtq_f32_u32(vmull_u16(s0,s0));
725             for (i += 4; i <= limit; i += 4 )
726             {
727                 internal::prefetch(src + i);
728                 uint16x4_t s1 = vld1_u16(src+i);
729                 float32x4_t sq = vcvtq_f32_u32(vmull_u16(s1, s1));
730                 s = vaddq_f32(s, sq);
731             }
732             f32 s2[4];
733             vst1q_f32(s2, s);
734             for (u32 j = 0; j < 4; j++)
735                 result += (f64)(s2[j]);
736         }
737 
738         for ( ; i < size.width; i++)
739             result += (f64)(src[i]) * (f64)(src[i]);
740     }
741     return result;
742 #else
743     (void)_size;
744     (void)srcBase;
745     (void)srcStride;
746 
747     return 0.;
748 #endif
749 }
750 
normL2(const Size2D & _size,const s16 * srcBase,ptrdiff_t srcStride)751 f64 normL2(const Size2D &_size,
752            const s16 * srcBase, ptrdiff_t srcStride)
753 {
754     internal::assertSupportedConfiguration();
755 #ifdef CAROTENE_NEON
756     Size2D size(_size);
757     if (srcStride == (ptrdiff_t)(size.width))
758     {
759         size.width *= size.height;
760         size.height = 1;
761     }
762     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
763     f64 result = 0;
764     for(size_t k = 0; k < size.height; ++k)
765     {
766         const s16* src = internal::getRowPtr( srcBase,  srcStride, k);
767         size_t i = 0;
768         for (; i < roiw4;)
769         {
770             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
771             int16x4_t s0 = vld1_s16(src+i);
772             float32x4_t s = vcvtq_f32_s32(vmull_s16(s0,s0));
773             for (i += 4; i <= limit; i += 4 )
774             {
775                 internal::prefetch(src + i);
776                 int16x4_t s1 = vld1_s16(src+i);
777                 float32x4_t sq = vcvtq_f32_s32(vmull_s16(s1, s1));
778                 s = vaddq_f32(s, sq);
779             }
780             f32 s2[4];
781             vst1q_f32(s2, s);
782             for (u32 j = 0; j < 4; j++)
783                 result += (f64)(s2[j]);
784         }
785 
786         for ( ; i < size.width; i++)
787             result += (f64)(src[i]) * (f64)(src[i]);
788     }
789     return result;
790 #else
791     (void)_size;
792     (void)srcBase;
793     (void)srcStride;
794 
795     return 0.;
796 #endif
797 }
798 
normL2(const Size2D & _size,const s32 * srcBase,ptrdiff_t srcStride)799 f64 normL2(const Size2D &_size,
800            const s32 * srcBase, ptrdiff_t srcStride)
801 {
802     internal::assertSupportedConfiguration();
803 #ifdef CAROTENE_NEON
804     Size2D size(_size);
805     if (srcStride == (ptrdiff_t)(size.width))
806     {
807         size.width *= size.height;
808         size.height = 1;
809     }
810     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
811     f64 result = 0;
812     for(size_t k = 0; k < size.height; ++k)
813     {
814         const s32* src = internal::getRowPtr( srcBase,  srcStride, k);
815         size_t i = 0;
816         for (; i < roiw4;)
817         {
818             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
819             float32x4_t s = vcvtq_f32_s32(vld1q_s32(src + i));
820             s = vmulq_f32(s, s);
821             for (i += 4; i <= limit; i += 4 )
822             {
823                 internal::prefetch(src + i);
824                 float32x4_t s1 = vcvtq_f32_s32(vld1q_s32(src + i));
825                 s = vmlaq_f32(s, s1, s1);
826             }
827 
828             f32 s2[4];
829             vst1q_f32(s2, s);
830 
831             for (u32 j = 0; j < 4; j++)
832                 result += (f64)(s2[j]);
833         }
834         for ( ; i < size.width; i++)
835             result += (f64)(src[i]) * (f64)(src[i]);
836     }
837     return result;
838 #else
839     (void)_size;
840     (void)srcBase;
841     (void)srcStride;
842 
843     return 0.;
844 #endif
845 }
846 
normL2(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)847 f64 normL2(const Size2D &_size,
848            const f32 * srcBase, ptrdiff_t srcStride)
849 {
850     internal::assertSupportedConfiguration();
851 #ifdef CAROTENE_NEON
852     Size2D size(_size);
853     if (srcStride == (ptrdiff_t)(size.width))
854     {
855         size.width *= size.height;
856         size.height = 1;
857     }
858     size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
859     f64 result = 0;
860     for(size_t k = 0; k < size.height; ++k)
861     {
862         const f32* src = internal::getRowPtr( srcBase,  srcStride, k);
863         size_t i = 0;
864         for (; i < roiw4;)
865         {
866             size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
867             float32x4_t s = vld1q_f32(src + i);
868             s = vmulq_f32(s, s);
869             for (i += 4; i <= limit; i += 4 )
870             {
871                 internal::prefetch(src + i);
872                 float32x4_t s1 = vld1q_f32(src + i);
873                 s = vmlaq_f32(s, s1, s1);
874             }
875 
876             f32 s2[4];
877             vst1q_f32(s2, s);
878 
879             for (u32 j = 0; j < 4; j++)
880                 result += (f64)(s2[j]);
881         }
882         for ( ; i < size.width; i++)
883             result += (f64)(src[i]) * (f64)(src[i]);
884     }
885     return result;
886 #else
887     (void)_size;
888     (void)srcBase;
889     (void)srcStride;
890 
891     return 0.;
892 #endif
893 }
894 
diffNormInf(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)895 s32 diffNormInf(const Size2D &_size,
896                 const u8 * src0Base, ptrdiff_t src0Stride,
897                 const u8 * src1Base, ptrdiff_t src1Stride)
898 {
899     internal::assertSupportedConfiguration();
900 #ifdef CAROTENE_NEON
901     Size2D size(_size);
902     if (src0Stride == src1Stride &&
903         src0Stride == (ptrdiff_t)(size.width))
904     {
905         size.width *= size.height;
906         size.height = 1;
907     }
908     s32 result = 0;
909     for(size_t k = 0; k < size.height; ++k)
910     {
911         const u8* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
912         const u8* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
913         size_t i = 0;
914 
915         if (size.width >= 16)
916         {
917             uint8x16_t vs3 = vdupq_n_u8(0);
918             for (; i < size.width - 16; i += 16)
919             {
920                 internal::prefetch(src1 + i);
921                 internal::prefetch(src2 + i);
922 
923                 uint8x16_t vs1 = vld1q_u8(src1 + i);
924                 uint8x16_t vs2 = vld1q_u8(src2 + i);
925 
926                 vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2));
927             }
928 
929             u8 s2[8];
930             vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3)));
931 
932             for (u32 j = 0; j < 8; j++)
933                 result = std::max((s32)(s2[j]), result);
934         }
935 
936         for (; i < size.width; i++)
937         {
938             result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result);
939         }
940     }
941     return result;
942 #else
943     (void)_size;
944     (void)src0Base;
945     (void)src0Stride;
946     (void)src1Base;
947     (void)src1Stride;
948 
949     return 0;
950 #endif
951 }
952 
diffNormInf(const Size2D & _size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride)953 f32 diffNormInf(const Size2D &_size,
954                 const f32 * src0Base, ptrdiff_t src0Stride,
955                 const f32 * src1Base, ptrdiff_t src1Stride)
956 {
957     internal::assertSupportedConfiguration();
958 #ifdef CAROTENE_NEON
959     Size2D size(_size);
960     if (src0Stride == src1Stride &&
961         src0Stride == (ptrdiff_t)(size.width))
962     {
963         size.width *= size.height;
964         size.height = 1;
965     }
966     f32 result = 0;
967     for(size_t k = 0; k < size.height; ++k)
968     {
969         const f32* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
970         const f32* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
971         size_t i = 0;
972 
973         if (size.width >= 4)
974         {
975             float32x4_t s = vabdq_f32(vld1q_f32(src1), vld1q_f32(src2));
976 
977             for (i += 4; i <= size.width - 4; i += 4 )
978             {
979                 internal::prefetch(src1 + i);
980                 internal::prefetch(src2 + i);
981 
982                 float32x4_t vs1 = vld1q_f32(src1 + i);
983                 float32x4_t vs2 = vld1q_f32(src2 + i);
984 
985                 float32x4_t vd = vabdq_f32(vs2, vs1);
986                 s = vmaxq_f32(s, vd);
987             }
988 
989             f32 s2[4];
990             vst1q_f32(s2, s);
991 
992             for (u32 j = 0; j < 4; j++)
993                 if (s2[j] > result)
994                     result = s2[j];
995         }
996 
997         for (; i < size.width; i++)
998         {
999             f32 v = std::abs(src1[i] - src2[i]);
1000             if (v > result)
1001                 result = v;
1002         }
1003     }
1004     return result;
1005 #else
1006     (void)_size;
1007     (void)src0Base;
1008     (void)src0Stride;
1009     (void)src1Base;
1010     (void)src1Stride;
1011 
1012     return 0.;
1013 #endif
1014 }
1015 
diffNormL1(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)1016 s32 diffNormL1(const Size2D &_size,
1017                const u8 * src0Base, ptrdiff_t src0Stride,
1018                const u8 * src1Base, ptrdiff_t src1Stride)
1019 {
1020     internal::assertSupportedConfiguration();
1021 #ifdef CAROTENE_NEON
1022     Size2D size(_size);
1023     if (src0Stride == src1Stride &&
1024         src0Stride == (ptrdiff_t)(size.width))
1025     {
1026         size.width *= size.height;
1027         size.height = 1;
1028     }
1029     s32 result = 0;
1030     for(size_t k = 0; k < size.height; ++k)
1031     {
1032         const u8* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
1033         const u8* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
1034         size_t i = 0;
1035 
1036         if (size.width >= 16)
1037         {
1038             for(; i <= size.width - 16;)
1039             {
1040                 size_t limit = std::min(size.width, i + 2*256) - 16;
1041                 uint16x8_t si1 = vmovq_n_u16(0);
1042                 uint16x8_t si2 = vmovq_n_u16(0);
1043 
1044                 for (; i <= limit; i += 16)
1045                 {
1046                     internal::prefetch(src1 + i);
1047                     internal::prefetch(src2 + i);
1048 
1049                     uint8x16_t vs1 = vld1q_u8(src1 + i);
1050                     uint8x16_t vs2 = vld1q_u8(src2 + i);
1051 
1052                     si1 = vabal_u8(si1, vget_low_u8(vs1), vget_low_u8(vs2));
1053                     si2 = vabal_u8(si2, vget_high_u8(vs1), vget_high_u8(vs2));
1054                 }
1055 
1056                 u32 s2[4];
1057                 vst1q_u32(s2, vaddq_u32(vpaddlq_u16(si1), vpaddlq_u16(si2)));
1058 
1059                 for (u32 j = 0; j < 4; j++)
1060                 {
1061                     if ((s32)(0x7fFFffFFu - s2[j]) <= result)
1062                     {
1063                         return 0x7fFFffFF; //result already saturated
1064                     }
1065                     result = (s32)((u32)(result) + s2[j]);
1066                 }
1067             }
1068 
1069         }
1070 
1071         for (; i < size.width; i++)
1072         {
1073             u32 v = std::abs((s32)(src1[i]) - (s32)(src2[i]));
1074 
1075             if ((s32)(0x7fFFffFFu - v) <= result)
1076             {
1077                 return 0x7fFFffFF; //result already saturated
1078             }
1079             result = (s32)((u32)(result) + v);
1080         }
1081     }
1082     return result;
1083 #else
1084     (void)_size;
1085     (void)src0Base;
1086     (void)src0Stride;
1087     (void)src1Base;
1088     (void)src1Stride;
1089 
1090     return 0;
1091 #endif
1092 }
1093 
diffNormL1(const Size2D & _size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride)1094 f64 diffNormL1(const Size2D &_size,
1095                const f32 * src0Base, ptrdiff_t src0Stride,
1096                const f32 * src1Base, ptrdiff_t src1Stride)
1097 {
1098     internal::assertSupportedConfiguration();
1099 #ifdef CAROTENE_NEON
1100     Size2D size(_size);
1101     if (src0Stride == src1Stride &&
1102         src0Stride == (ptrdiff_t)(size.width))
1103     {
1104         size.width *= size.height;
1105         size.height = 1;
1106     }
1107     f64 result = 0;
1108     for(size_t k = 0; k < size.height; ++k)
1109     {
1110         const f32* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
1111         const f32* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
1112         size_t i = 0;
1113 
1114         if (size.width >= 4)
1115         {
1116             for(; i <= size.width - 4;)
1117             {
1118                 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
1119                 float32x4_t s = vmovq_n_f32(0.0f);
1120 
1121                 for (; i <= limit; i += 4 )
1122                 {
1123                     internal::prefetch(src1 + i);
1124                     internal::prefetch(src2 + i);
1125 
1126                     float32x4_t vs1 = vld1q_f32(src1 + i);
1127                     float32x4_t vs2 = vld1q_f32(src2 + i);
1128 
1129                     float32x4_t vd = vabdq_f32(vs2, vs1);
1130                     s = vaddq_f32(s, vd);
1131                 }
1132 
1133                 f32 s2[4];
1134                 vst1q_f32(s2, s);
1135 
1136                 for (u32 j = 0; j < 4; j++)
1137                     result += (f64)(s2[j]);
1138             }
1139         }
1140 
1141         for (; i < size.width; i++)
1142         {
1143             f32 v = std::abs(src1[i] - src2[i]);
1144             result += (f64)(v);
1145         }
1146     }
1147     return result;
1148 #else
1149     (void)_size;
1150     (void)src0Base;
1151     (void)src0Stride;
1152     (void)src1Base;
1153     (void)src1Stride;
1154 
1155     return 0.;
1156 #endif
1157 }
1158 
diffNormL2(const Size2D & _size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride)1159 s32 diffNormL2(const Size2D &_size,
1160                const u8 * src0Base, ptrdiff_t src0Stride,
1161                const u8 * src1Base, ptrdiff_t src1Stride)
1162 {
1163     internal::assertSupportedConfiguration();
1164 #ifdef CAROTENE_NEON
1165     Size2D size(_size);
1166     if (src0Stride == src1Stride &&
1167         src0Stride == (ptrdiff_t)(size.width))
1168     {
1169         size.width *= size.height;
1170         size.height = 1;
1171     }
1172     s32 result = 0;
1173     for(size_t k = 0; k < size.height; ++k)
1174     {
1175         const u8* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
1176         const u8* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
1177         size_t i = 0;
1178 
1179 #define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow
1180         if (size.width >= 16)
1181         {
1182             for(; i <= size.width - 16;)
1183             {
1184                 size_t limit = std::min(size.width, i + NORML28U_BLOCK_SIZE) - 16;
1185                 uint32x4_t si1 = vmovq_n_u32(0);
1186                 uint32x4_t si2 = vmovq_n_u32(0);
1187 
1188                 for (; i <= limit; i += 16)
1189                 {
1190                     internal::prefetch(src1 + i);
1191                     internal::prefetch(src2 + i);
1192 
1193                     uint8x16_t vs1 = vld1q_u8(src1 + i);
1194                     uint8x16_t vs2 = vld1q_u8(src2 + i);
1195 
1196                     uint16x8_t vdlo = vabdl_u8(vget_low_u8(vs1), vget_low_u8(vs2));
1197                     uint16x8_t vdhi = vabdl_u8(vget_high_u8(vs1), vget_high_u8(vs2));
1198 
1199                     si1 = vmlal_u16(si1, vget_low_u16(vdlo), vget_low_u16(vdlo));
1200                     si2 = vmlal_u16(si2, vget_high_u16(vdlo), vget_high_u16(vdlo));
1201 
1202                     si1 = vmlal_u16(si1, vget_low_u16(vdhi), vget_low_u16(vdhi));
1203                     si2 = vmlal_u16(si2, vget_high_u16(vdhi), vget_high_u16(vdhi));
1204                 }
1205 
1206                 u32 s2[4];
1207                 vst1q_u32(s2, vqaddq_u32(si1, si2));
1208 
1209                 for (u32 j = 0; j < 4; j++)
1210                 {
1211                     if ((s32)(0x7fFFffFFu - s2[j]) <= result)
1212                     {
1213                         return 0x7fFFffFF; //result already saturated
1214                     }
1215                     result += (s32)s2[j];
1216                 }
1217             }
1218 
1219         }
1220 
1221         for (; i < size.width; i++)
1222         {
1223             s32 v = (s32)(src1[i]) - (s32)(src2[i]);
1224             v *= v;
1225 
1226             if ((s32)(0x7fFFffFFu - (u32)(v)) <= result)
1227             {
1228                 return 0x7fFFffFF; //result already saturated
1229             }
1230             result += v;
1231         }
1232     }
1233     return result;
1234 #else
1235     (void)_size;
1236     (void)src0Base;
1237     (void)src0Stride;
1238     (void)src1Base;
1239     (void)src1Stride;
1240 
1241     return 0;
1242 #endif
1243 }
1244 
diffNormL2(const Size2D & _size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride)1245 f64 diffNormL2(const Size2D &_size,
1246                const f32 * src0Base, ptrdiff_t src0Stride,
1247                const f32 * src1Base, ptrdiff_t src1Stride)
1248 {
1249     internal::assertSupportedConfiguration();
1250 #ifdef CAROTENE_NEON
1251     Size2D size(_size);
1252     if (src0Stride == src1Stride &&
1253         src0Stride == (ptrdiff_t)(size.width))
1254     {
1255         size.width *= size.height;
1256         size.height = 1;
1257     }
1258     f64 result = 0;
1259     for(size_t k = 0; k < size.height; ++k)
1260     {
1261         const f32* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
1262         const f32* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
1263         size_t i = 0;
1264 
1265         if (size.width >= 4)
1266         {
1267             for(; i <= size.width - 4;)
1268             {
1269                 size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
1270                 float32x4_t s = vmovq_n_f32(0.0f);
1271 
1272                 for (; i <= limit; i += 4 )
1273                 {
1274                     internal::prefetch(src1 + i);
1275                     internal::prefetch(src2 + i);
1276 
1277                     float32x4_t vs1 = vld1q_f32(src1 + i);
1278                     float32x4_t vs2 = vld1q_f32(src2 + i);
1279 
1280                     float32x4_t vd = vsubq_f32(vs2,vs1);
1281                     s = vmlaq_f32(s, vd, vd);
1282                 }
1283 
1284                 f32 s2[4];
1285                 vst1q_f32(s2, s);
1286 
1287                 for (u32 j = 0; j < 4; j++)
1288                     result += (f64)(s2[j]);
1289             }
1290         }
1291 
1292         for (; i < size.width; i++)
1293         {
1294             f32 v = src1[i] - src2[i];
1295             result += v * v;
1296         }
1297     }
1298     return result;
1299 #else
1300     (void)_size;
1301     (void)src0Base;
1302     (void)src0Stride;
1303     (void)src1Base;
1304     (void)src1Stride;
1305 
1306     return 0.;
1307 #endif
1308 }
1309 
1310 } // namespace CAROTENE_NS
1311