1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
pooling3x3s2_max_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Option & opt)15 static void pooling3x3s2_max_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
16 {
17     int w = bottom_blob.w;
18     int inch = bottom_blob.c;
19 
20     int outw = top_blob.w;
21     int outh = top_blob.h;
22 
23     const int tailstep = (w - 2 * outw + w) * 4;
24 
25     #pragma omp parallel for num_threads(opt.num_threads)
26     for (int q = 0; q < inch; q++)
27     {
28         const Mat img0 = bottom_blob.channel(q);
29         float* outptr = top_blob.channel(q);
30 
31         const float* r0 = img0.row(0);
32         const float* r1 = img0.row(1);
33         const float* r2 = img0.row(2);
34 
35         for (int i = 0; i < outh; i++)
36         {
37             int j = 0;
38 
39             for (; j + 3 < outw; j += 4)
40             {
41 #if __aarch64__
42                 asm volatile(
43                     "prfm   pldl1keep, [%1, #512]   \n"
44                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
45 
46                     "fmax   v16.4s, v0.4s, v1.4s    \n"
47                     "fmax   v17.4s, v2.4s, v3.4s    \n"
48 
49                     "prfm   pldl1keep, [%1, #512]   \n"
50                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
51 
52                     "fmax   v18.4s, v4.4s, v5.4s    \n"
53                     "fmax   v19.4s, v6.4s, v7.4s    \n"
54 
55                     "ld1    {v8.4s}, [%1]           \n"
56 
57                     "fmax   v20.4s, v16.4s, v2.4s   \n"
58                     "fmax   v21.4s, v17.4s, v4.4s   \n"
59 
60                     "prfm   pldl1keep, [%2, #512]   \n"
61                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
62 
63                     "fmax   v22.4s, v18.4s, v6.4s   \n"
64                     "fmax   v23.4s, v19.4s, v8.4s   \n"
65 
66                     "prfm   pldl1keep, [%2, #512]   \n"
67                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
68 
69                     "fmax   v16.4s, v0.4s, v1.4s    \n"
70                     "fmax   v17.4s, v2.4s, v3.4s    \n"
71 
72                     "fmax   v18.4s, v4.4s, v5.4s    \n"
73                     "fmax   v19.4s, v6.4s, v7.4s    \n"
74 
75                     "ld1    {v8.4s}, [%2]           \n"
76 
77                     "fmax   v24.4s, v16.4s, v2.4s   \n"
78                     "fmax   v25.4s, v17.4s, v4.4s   \n"
79 
80                     "prfm   pldl1keep, [%3, #512]   \n"
81                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
82 
83                     "fmax   v26.4s, v18.4s, v6.4s   \n"
84                     "fmax   v27.4s, v19.4s, v8.4s   \n"
85 
86                     "prfm   pldl1keep, [%3, #512]   \n"
87                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n"
88 
89                     "fmax   v16.4s, v0.4s, v1.4s    \n"
90                     "fmax   v17.4s, v2.4s, v3.4s    \n"
91 
92                     "fmax   v18.4s, v4.4s, v5.4s    \n"
93                     "fmax   v19.4s, v6.4s, v7.4s    \n"
94 
95                     "ld1    {v8.4s}, [%3]           \n"
96 
97                     "fmax   v28.4s, v16.4s, v2.4s   \n"
98                     "fmax   v29.4s, v17.4s, v4.4s   \n"
99                     "fmax   v30.4s, v18.4s, v6.4s   \n"
100                     "fmax   v31.4s, v19.4s, v8.4s   \n"
101 
102                     "fmax   v20.4s, v20.4s, v24.4s  \n"
103                     "fmax   v21.4s, v21.4s, v25.4s  \n"
104                     "fmax   v22.4s, v22.4s, v26.4s  \n"
105                     "fmax   v23.4s, v23.4s, v27.4s  \n"
106 
107                     "fmax   v20.4s, v20.4s, v28.4s  \n"
108                     "fmax   v21.4s, v21.4s, v29.4s  \n"
109                     "fmax   v22.4s, v22.4s, v30.4s  \n"
110                     "fmax   v23.4s, v23.4s, v31.4s  \n"
111 
112                     "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
113 
114                     : "=r"(outptr), // %0
115                     "=r"(r0),     // %1
116                     "=r"(r1),     // %2
117                     "=r"(r2)      // %3
118                     : "0"(outptr),
119                     "1"(r0),
120                     "2"(r1),
121                     "3"(r2)
122                     : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
123 #else  // __aarch64__
124                 asm volatile(
125                     "pld        [%1, #512]      \n"
126                     "vldm       %1!, {d0-d7}    \n"
127 
128                     "pld        [%2, #512]      \n"
129                     "vldm       %2!, {d8-d15}   \n"
130 
131                     "vmax.f32   q0, q0, q4      \n"
132                     "vmax.f32   q1, q1, q5      \n"
133 
134                     "pld        [%3, #512]      \n"
135                     "vldm       %3!, {d16-d23}  \n"
136 
137                     "vmax.f32   q2, q2, q6      \n"
138                     "vmax.f32   q3, q3, q7      \n"
139 
140                     "vmax.f32   q0, q0, q8      \n"
141                     "vmax.f32   q1, q1, q9      \n"
142 
143                     "pld        [%1, #512]      \n"
144                     "vldm       %1!, {d8-d15}   \n"
145 
146                     "vmax.f32   q2, q2, q10     \n"
147                     "vmax.f32   q3, q3, q11     \n"
148 
149                     "pld        [%2, #512]      \n"
150                     "vldm       %2!, {d16-d23}  \n"
151 
152                     "vmax.f32   q4, q4, q8      \n"
153                     "vmax.f32   q5, q5, q9      \n"
154 
155                     "pld        [%3, #512]      \n"
156                     "vldm       %3!, {d24-d31}  \n"
157 
158                     "vmax.f32   q6, q6, q10     \n"
159                     "vmax.f32   q7, q7, q11     \n"
160 
161                     "vmax.f32   q4, q4, q12     \n"
162                     "vmax.f32   q5, q5, q13     \n"
163 
164                     "vld1.f32   {d24-d25}, [%1 :128] \n"
165                     "vld1.f32   {d26-d27}, [%2 :128] \n"
166 
167                     "vmax.f32   q6, q6, q14     \n"
168                     "vmax.f32   q7, q7, q15     \n"
169 
170                     "vld1.f32   {d28-d29}, [%3 :128] \n"
171 
172                     "vmax.f32   q8, q12, q13    \n"
173                     "vmax.f32   q8, q8, q14     \n"
174 
175                     "vmax.f32   q12, q0, q1     \n"
176                     "vmax.f32   q13, q2, q3     \n"
177                     "vmax.f32   q14, q4, q5     \n"
178                     "vmax.f32   q15, q6, q7     \n"
179 
180                     "vmax.f32   q12, q12, q2    \n"
181                     "vmax.f32   q13, q13, q4    \n"
182                     "vmax.f32   q14, q14, q6    \n"
183                     "vmax.f32   q15, q15, q8    \n"
184 
185                     "vstm       %0!, {d24-d31}  \n"
186 
187                     : "=r"(outptr), // %0
188                     "=r"(r0),     // %1
189                     "=r"(r1),     // %2
190                     "=r"(r2)      // %3
191                     : "0"(outptr),
192                     "1"(r0),
193                     "2"(r1),
194                     "3"(r2)
195                     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
196 #endif // __aarch64__
197             }
198             for (; j + 1 < outw; j += 2)
199             {
200 #if __aarch64__
201                 asm volatile(
202                     "prfm   pldl1keep, [%1, #512]   \n"
203                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
204 
205                     "prfm   pldl1keep, [%2, #512]   \n"
206                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
207 
208                     "fmax   v16.4s, v0.4s, v4.4s    \n"
209                     "fmax   v17.4s, v1.4s, v5.4s    \n"
210 
211                     "prfm   pldl1keep, [%3, #512]   \n"
212                     "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
213 
214                     "fmax   v18.4s, v2.4s, v6.4s    \n"
215                     "fmax   v19.4s, v3.4s, v7.4s    \n"
216 
217                     "ld1    {v0.4s}, [%1]           \n"
218 
219                     "fmax   v16.4s, v16.4s, v20.4s  \n"
220                     "fmax   v17.4s, v17.4s, v21.4s  \n"
221 
222                     "ld1    {v1.4s}, [%2]           \n"
223 
224                     "fmax   v18.4s, v18.4s, v22.4s  \n"
225                     "fmax   v19.4s, v19.4s, v23.4s  \n"
226 
227                     "ld1    {v2.4s}, [%3]           \n"
228 
229                     "fmax   v3.4s, v0.4s, v1.4s     \n"
230 
231                     "fmax   v20.4s, v16.4s, v17.4s  \n"
232                     "fmax   v21.4s, v18.4s, v19.4s  \n"
233 
234                     "fmax   v3.4s, v3.4s, v2.4s     \n"
235 
236                     "fmax   v20.4s, v20.4s, v18.4s  \n"
237                     "fmax   v21.4s, v21.4s, v3.4s   \n"
238 
239                     "st1    {v20.4s, v21.4s}, [%0], #32 \n"
240 
241                     : "=r"(outptr), // %0
242                     "=r"(r0),     // %1
243                     "=r"(r1),     // %2
244                     "=r"(r2)      // %3
245                     : "0"(outptr),
246                     "1"(r0),
247                     "2"(r1),
248                     "3"(r2)
249                     : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
250 #else  // __aarch64__
251                 asm volatile(
252                     "pld        [%1, #512]      \n"
253                     "vldm       %1!, {d0-d7}    \n"
254 
255                     "pld        [%2, #512]      \n"
256                     "vldm       %2!, {d8-d15}   \n"
257 
258                     "vmax.f32   q12, q0, q4     \n"
259                     "vmax.f32   q13, q1, q5     \n"
260 
261                     "pld        [%3, #512]      \n"
262                     "vldm       %3!, {d16-d23}  \n"
263 
264                     "vmax.f32   q14, q2, q6     \n"
265                     "vmax.f32   q15, q3, q7     \n"
266 
267                     "vld1.f32   {d0-d1}, [%1 :128] \n"
268 
269                     "vmax.f32   q12, q12, q8    \n"
270                     "vmax.f32   q13, q13, q9    \n"
271 
272                     "vld1.f32   {d2-d3}, [%2 :128] \n"
273 
274                     "vmax.f32   q14, q14, q10   \n"
275                     "vmax.f32   q15, q15, q11   \n"
276 
277                     "vld1.f32   {d4-d5}, [%3 :128] \n"
278 
279                     "vmax.f32   q3, q0, q1      \n"
280 
281                     "vmax.f32   q4, q12, q13    \n"
282                     "vmax.f32   q5, q14, q15    \n"
283 
284                     "vmax.f32   q3, q3, q2      \n"
285 
286                     "vmax.f32   q4, q4, q14     \n"
287                     "vmax.f32   q5, q5, q3      \n"
288 
289                     "vst1.f32   {d8-d11}, [%0 :128]! \n"
290 
291                     : "=r"(outptr), // %0
292                     "=r"(r0),     // %1
293                     "=r"(r1),     // %2
294                     "=r"(r2)      // %3
295                     : "0"(outptr),
296                     "1"(r0),
297                     "2"(r1),
298                     "3"(r2)
299                     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
300 #endif // __aarch64__
301             }
302             for (; j < outw; j++)
303             {
304                 float32x4_t _r00 = vld1q_f32(r0);
305                 float32x4_t _r01 = vld1q_f32(r0 + 4);
306                 float32x4_t _r02 = vld1q_f32(r0 + 8);
307                 float32x4_t _r10 = vld1q_f32(r1);
308                 float32x4_t _r11 = vld1q_f32(r1 + 4);
309                 float32x4_t _r12 = vld1q_f32(r1 + 8);
310                 float32x4_t _r20 = vld1q_f32(r2);
311                 float32x4_t _r21 = vld1q_f32(r2 + 4);
312                 float32x4_t _r22 = vld1q_f32(r2 + 8);
313 
314                 float32x4_t _max0 = vmaxq_f32(vmaxq_f32(_r00, _r01), _r02);
315                 float32x4_t _max1 = vmaxq_f32(vmaxq_f32(_r10, _r11), _r12);
316                 float32x4_t _max2 = vmaxq_f32(vmaxq_f32(_r20, _r21), _r22);
317 
318                 float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);
319 
320                 vst1q_f32(outptr, _max);
321 
322                 r0 += 8;
323                 r1 += 8;
324                 r2 += 8;
325                 outptr += 4;
326             }
327 
328             r0 += tailstep;
329             r1 += tailstep;
330             r2 += tailstep;
331         }
332     }
333 }
334