// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

pooling2x2s2_max_neon(const Mat & bottom_blob,Mat & top_blob,const Option & opt)15 static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
16 {
17 int w = bottom_blob.w;
18 int inch = bottom_blob.c;
19
20 int outw = top_blob.w;
21 int outh = top_blob.h;
22
23 const int tailstep = w - 2 * outw + w;
24
25 #pragma omp parallel for num_threads(opt.num_threads)
26 for (int q = 0; q < inch; q++)
27 {
28 const float* img0 = bottom_blob.channel(q);
29 float* outptr = top_blob.channel(q);
30
31 const float* r0 = img0;
32 const float* r1 = img0 + w;
33
34 for (int i = 0; i < outh; i++)
35 {
36 #if __ARM_NEON
37 int nn = outw >> 2;
38 int remain = outw - (nn << 2);
39 #else
40 int remain = outw;
41 #endif // __ARM_NEON
42
43 #if __ARM_NEON
44 #if __aarch64__
45 if (nn > 0)
46 {
47 asm volatile(
48 "0: \n"
49 "prfm pldl1keep, [%1, #256] \n"
50 "prfm pldl1keep, [%2, #256] \n"
51 "ld1 {v0.4s, v1.4s}, [%1], #32 \n"
52 "ld1 {v2.4s, v3.4s}, [%2], #32 \n"
53 "fmax v0.4s, v0.4s, v2.4s \n"
54 "fmax v1.4s, v1.4s, v3.4s \n"
55 "fmaxp v2.4s, v0.4s, v1.4s \n"
56 "subs %w0, %w0, #1 \n"
57 "st1 {v2.4s}, [%3], #16 \n"
58 "bne 0b \n"
59 : "=r"(nn), // %0
60 "=r"(r0), // %1
61 "=r"(r1), // %2
62 "=r"(outptr) // %3
63 : "0"(nn),
64 "1"(r0),
65 "2"(r1),
66 "3"(outptr)
67 : "cc", "memory", "v0", "v1", "v2", "v3");
68 }
69 #else
70 if (nn > 0)
71 {
72 asm volatile(
73 "0: \n"
74 "pld [%1, #256] \n"
75 "pld [%2, #256] \n"
76 "vld1.f32 {d0-d3}, [%1]! \n"
77 "vld1.f32 {d4-d7}, [%2]! \n"
78 "vmax.f32 q0, q0, q2 \n"
79 "vmax.f32 q1, q1, q3 \n"
80 "vpmax.f32 d4, d0, d1 \n"
81 "vpmax.f32 d5, d2, d3 \n"
82 "subs %0, #1 \n"
83 "vst1.f32 {d4-d5}, [%3]! \n"
84 "bne 0b \n"
85 : "=r"(nn), // %0
86 "=r"(r0), // %1
87 "=r"(r1), // %2
88 "=r"(outptr) // %3
89 : "0"(nn),
90 "1"(r0),
91 "2"(r1),
92 "3"(outptr)
93 : "cc", "memory", "q0", "q1", "q2", "q3");
94 }
95 #endif // __aarch64__
96 #endif // __ARM_NEON
97 for (; remain > 0; remain--)
98 {
99 float max0 = std::max(r0[0], r0[1]);
100 float max1 = std::max(r1[0], r1[1]);
101
102 *outptr = std::max(max0, max1);
103
104 r0 += 2;
105 r1 += 2;
106 outptr++;
107 }
108
109 r0 += tailstep;
110 r1 += tailstep;
111 }
112 }
113 }
114