1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
padding_constant_pack8_avx(const Mat & src,Mat & dst,int top,int bottom,int left,int right,__m256 v)15 static void padding_constant_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m256 v)
16 {
17     const float* ptr = src;
18     float* outptr = dst;
19     int top_size = top * dst.w;
20     int bottom_size = bottom * dst.w;
21 
22     // fill top
23     for (int y = 0; y < top_size; y++)
24     {
25         _mm256_storeu_ps(outptr, v);
26         outptr += 8;
27     }
28     // fill center
29     for (int y = 0; y < src.h; y++)
30     {
31         for (int x = 0; x < left; x++)
32         {
33             _mm256_storeu_ps(outptr, v);
34             outptr += 8;
35         }
36         for (int x = 0; x < src.w; x++)
37         {
38             _mm256_storeu_ps(outptr, _mm256_loadu_ps(ptr));
39             ptr += 8;
40             outptr += 8;
41         }
42         for (int x = 0; x < right; x++)
43         {
44             _mm256_storeu_ps(outptr, v);
45             outptr += 8;
46         }
47     }
48     // fill top
49     for (int y = 0; y < bottom_size; y++)
50     {
51         _mm256_storeu_ps(outptr, v);
52         outptr += 8;
53     }
54 }
55 
padding_replicate_pack8_avx(const Mat & src,Mat & dst,int top,int bottom,int left,int right)56 static void padding_replicate_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
57 {
58     const float* ptr = src;
59     float* outptr = dst;
60 
61     // fill top
62     for (int y = 0; y < top; y++)
63     {
64         const float* ptr0 = ptr;
65         __m256 _p = _mm256_loadu_ps(ptr0);
66         for (int x = 0; x < left; x++)
67         {
68             _mm256_storeu_ps(outptr, _p);
69             outptr += 8;
70         }
71         for (int x = 0; x < src.w; x++)
72         {
73             _p = _mm256_loadu_ps(ptr0);
74             _mm256_storeu_ps(outptr, _p);
75             ptr0 += 8;
76             outptr += 8;
77         }
78         for (int x = 0; x < right; x++)
79         {
80             _mm256_storeu_ps(outptr, _p);
81             outptr += 8;
82         }
83     }
84     // fill center
85     for (int y = 0; y < src.h; y++)
86     {
87         __m256 _p = _mm256_loadu_ps(ptr);
88         for (int x = 0; x < left; x++)
89         {
90             _mm256_storeu_ps(outptr, _p);
91             outptr += 8;
92         }
93         for (int x = 0; x < src.w; x++)
94         {
95             _p = _mm256_loadu_ps(ptr);
96             _mm256_storeu_ps(outptr, _p);
97             ptr += 8;
98             outptr += 8;
99         }
100         for (int x = 0; x < right; x++)
101         {
102             _mm256_storeu_ps(outptr, _p);
103             outptr += 8;
104         }
105     }
106     // fill bottom
107     ptr -= src.w * 8;
108     for (int y = 0; y < bottom; y++)
109     {
110         const float* ptr0 = ptr;
111         __m256 _p = _mm256_loadu_ps(ptr0);
112         for (int x = 0; x < left; x++)
113         {
114             _mm256_storeu_ps(outptr, _p);
115             outptr += 8;
116         }
117         for (int x = 0; x < src.w; x++)
118         {
119             _p = _mm256_loadu_ps(ptr0);
120             _mm256_storeu_ps(outptr, _p);
121             ptr0 += 8;
122             outptr += 8;
123         }
124         for (int x = 0; x < right; x++)
125         {
126             _mm256_storeu_ps(outptr, _p);
127             outptr += 8;
128         }
129     }
130 }
131 
padding_reflect_pack8_avx(const Mat & src,Mat & dst,int top,int bottom,int left,int right)132 static void padding_reflect_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
133 {
134     const float* ptr = src;
135     float* outptr = dst;
136 
137     // fill top
138     ptr += top * src.w * 8;
139     for (int y = 0; y < top; y++)
140     {
141         const float* ptr0 = ptr;
142         for (int x = 0; x < left; x++)
143         {
144             __m256 _p = _mm256_loadu_ps(ptr0 + (left - x) * 8);
145             _mm256_storeu_ps(outptr, _p);
146             outptr += 8;
147         }
148         for (int x = 0; x < src.w; x++)
149         {
150             __m256 _p = _mm256_loadu_ps(ptr0);
151             _mm256_storeu_ps(outptr, _p);
152             ptr0 += 8;
153             outptr += 8;
154         }
155         for (int x = 0; x < right; x++)
156         {
157             __m256 _p = _mm256_loadu_ps(ptr0 - 16 - x * 8);
158             _mm256_storeu_ps(outptr, _p);
159             outptr += 8;
160         }
161         ptr -= src.w * 8;
162     }
163     // fill center
164     for (int y = 0; y < src.h; y++)
165     {
166         for (int x = 0; x < left; x++)
167         {
168             __m256 _p = _mm256_loadu_ps(ptr + (left - x) * 8);
169             _mm256_storeu_ps(outptr, _p);
170             outptr += 8;
171         }
172         for (int x = 0; x < src.w; x++)
173         {
174             __m256 _p = _mm256_loadu_ps(ptr);
175             _mm256_storeu_ps(outptr, _p);
176             ptr += 8;
177             outptr += 8;
178         }
179         for (int x = 0; x < right; x++)
180         {
181             __m256 _p = _mm256_loadu_ps(ptr - 16 - x * 8);
182             _mm256_storeu_ps(outptr, _p);
183             outptr += 8;
184         }
185     }
186     // fill bottom
187     ptr -= 2 * src.w * 8;
188     for (int y = 0; y < bottom; y++)
189     {
190         const float* ptr0 = ptr;
191         for (int x = 0; x < left; x++)
192         {
193             __m256 _p = _mm256_loadu_ps(ptr0 + (left - x) * 8);
194             _mm256_storeu_ps(outptr, _p);
195             outptr += 8;
196         }
197         for (int x = 0; x < src.w; x++)
198         {
199             __m256 _p = _mm256_loadu_ps(ptr0);
200             _mm256_storeu_ps(outptr, _p);
201             ptr0 += 8;
202             outptr += 8;
203         }
204         for (int x = 0; x < right; x++)
205         {
206             __m256 _p = _mm256_loadu_ps(ptr0 - 16 - x * 8);
207             _mm256_storeu_ps(outptr, _p);
208             outptr += 8;
209         }
210         ptr -= src.w * 8;
211     }
212 }
213