1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
#include "absval_arm.h"

#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
20 
21 namespace ncnn {
22 
AbsVal_arm()23 AbsVal_arm::AbsVal_arm()
24 {
25 #if __ARM_NEON
26     support_packing = true;
27 #endif // __ARM_NEON
28 }
29 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const30 int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
31 {
32     int w = bottom_top_blob.w;
33     int h = bottom_top_blob.h;
34     int channels = bottom_top_blob.c;
35     int size = w * h;
36     int elempack = bottom_top_blob.elempack;
37 
38 #if __ARM_NEON
39     if (elempack == 4)
40     {
41         #pragma omp parallel for num_threads(opt.num_threads)
42         for (int q = 0; q < channels; q++)
43         {
44             float* ptr = bottom_top_blob.channel(q);
45 
46             for (int i = 0; i < size; i++)
47             {
48                 float32x4_t _p = vld1q_f32(ptr);
49                 _p = vabsq_f32(_p);
50                 vst1q_f32(ptr, _p);
51 
52                 ptr += 4;
53             }
54         }
55 
56         return 0;
57     }
58 #endif // __ARM_NEON
59 
60     #pragma omp parallel for num_threads(opt.num_threads)
61     for (int q = 0; q < channels; q++)
62     {
63         float* ptr = bottom_top_blob.channel(q);
64 
65 #if __ARM_NEON
66         int nn = size >> 2;
67         int remain = size - (nn << 2);
68 #else
69         int remain = size;
70 #endif // __ARM_NEON
71 
72 #if __ARM_NEON
73 #if __aarch64__
74         if (nn > 0)
75         {
76             asm volatile(
77                 "0:                               \n"
78                 "prfm       pldl1keep, [%1, #128] \n"
79                 "ld1        {v0.4s}, [%1]         \n"
80                 "fabs       v0.4s, v0.4s          \n"
81                 "subs       %w0, %w0, #1          \n"
82                 "st1        {v0.4s}, [%1], #16    \n"
83                 "bne        0b                    \n"
84                 : "=r"(nn), // %0
85                 "=r"(ptr) // %1
86                 : "0"(nn),
87                 "1"(ptr)
88                 : "cc", "memory", "v0");
89         }
90 #else
91         if (nn > 0)
92         {
93             asm volatile(
94                 "0:                             \n"
95                 "vld1.f32   {d0-d1}, [%1]       \n"
96                 "vabs.f32   q0, q0              \n"
97                 "subs       %0, #1              \n"
98                 "vst1.f32   {d0-d1}, [%1]!      \n"
99                 "bne        0b                  \n"
100                 : "=r"(nn), // %0
101                 "=r"(ptr) // %1
102                 : "0"(nn),
103                 "1"(ptr)
104                 : "cc", "memory", "q0");
105         }
106 #endif // __aarch64__
107 #endif // __ARM_NEON
108         for (; remain > 0; remain--)
109         {
110             *ptr = *ptr > 0 ? *ptr : -*ptr;
111 
112             ptr++;
113         }
114     }
115 
116     return 0;
117 }
118 
119 } // namespace ncnn
120