1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
pooling2x2s2_max_neon(const Mat & bottom_blob,Mat & top_blob,const Option & opt)15 static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
16 {
17     int w = bottom_blob.w;
18     int inch = bottom_blob.c;
19 
20     int outw = top_blob.w;
21     int outh = top_blob.h;
22 
23     const int tailstep = w - 2 * outw + w;
24 
25     #pragma omp parallel for num_threads(opt.num_threads)
26     for (int q = 0; q < inch; q++)
27     {
28         const float* img0 = bottom_blob.channel(q);
29         float* outptr = top_blob.channel(q);
30 
31         const float* r0 = img0;
32         const float* r1 = img0 + w;
33 
34         for (int i = 0; i < outh; i++)
35         {
36 #if __ARM_NEON
37             int nn = outw >> 2;
38             int remain = outw - (nn << 2);
39 #else
40             int remain = outw;
41 #endif // __ARM_NEON
42 
43 #if __ARM_NEON
44 #if __aarch64__
45             if (nn > 0)
46             {
47                 asm volatile(
48                     "0:                                   \n"
49                     "prfm       pldl1keep, [%1, #256]     \n"
50                     "prfm       pldl1keep, [%2, #256]     \n"
51                     "ld1        {v0.4s, v1.4s}, [%1], #32 \n"
52                     "ld1        {v2.4s, v3.4s}, [%2], #32 \n"
53                     "fmax       v0.4s, v0.4s, v2.4s       \n"
54                     "fmax       v1.4s, v1.4s, v3.4s       \n"
55                     "fmaxp      v2.4s, v0.4s, v1.4s       \n"
56                     "subs       %w0, %w0, #1              \n"
57                     "st1        {v2.4s}, [%3], #16        \n"
58                     "bne        0b                        \n"
59                     : "=r"(nn),    // %0
60                     "=r"(r0),    // %1
61                     "=r"(r1),    // %2
62                     "=r"(outptr) // %3
63                     : "0"(nn),
64                     "1"(r0),
65                     "2"(r1),
66                     "3"(outptr)
67                     : "cc", "memory", "v0", "v1", "v2", "v3");
68             }
69 #else
70             if (nn > 0)
71             {
72                 asm volatile(
73                     "0:                             \n"
74                     "pld        [%1, #256]          \n"
75                     "pld        [%2, #256]          \n"
76                     "vld1.f32   {d0-d3}, [%1]!      \n"
77                     "vld1.f32   {d4-d7}, [%2]!      \n"
78                     "vmax.f32   q0, q0, q2          \n"
79                     "vmax.f32   q1, q1, q3          \n"
80                     "vpmax.f32  d4, d0, d1          \n"
81                     "vpmax.f32  d5, d2, d3          \n"
82                     "subs       %0, #1              \n"
83                     "vst1.f32   {d4-d5}, [%3]!      \n"
84                     "bne        0b                  \n"
85                     : "=r"(nn),    // %0
86                     "=r"(r0),    // %1
87                     "=r"(r1),    // %2
88                     "=r"(outptr) // %3
89                     : "0"(nn),
90                     "1"(r0),
91                     "2"(r1),
92                     "3"(outptr)
93                     : "cc", "memory", "q0", "q1", "q2", "q3");
94             }
95 #endif // __aarch64__
96 #endif // __ARM_NEON
97             for (; remain > 0; remain--)
98             {
99                 float max0 = std::max(r0[0], r0[1]);
100                 float max1 = std::max(r1[0], r1[1]);
101 
102                 *outptr = std::max(max0, max1);
103 
104                 r0 += 2;
105                 r1 += 2;
106                 outptr++;
107             }
108 
109             r0 += tailstep;
110             r1 += tailstep;
111         }
112     }
113 }
114