1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
padding_constant_pack4_sse(const Mat & src,Mat & dst,int top,int bottom,int left,int right,__m128 v)15 static void padding_constant_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v)
16 {
17 const float* ptr = src;
18 float* outptr = dst;
19 int top_size = top * dst.w;
20 int bottom_size = bottom * dst.w;
21
22 // fill top
23 for (int y = 0; y < top_size; y++)
24 {
25 _mm_storeu_ps(outptr, v);
26 outptr += 4;
27 }
28 // fill center
29 for (int y = 0; y < src.h; y++)
30 {
31 for (int x = 0; x < left; x++)
32 {
33 _mm_storeu_ps(outptr, v);
34 outptr += 4;
35 }
36 for (int x = 0; x < src.w; x++)
37 {
38 _mm_storeu_ps(outptr, _mm_loadu_ps(ptr));
39 ptr += 4;
40 outptr += 4;
41 }
42 for (int x = 0; x < right; x++)
43 {
44 _mm_storeu_ps(outptr, v);
45 outptr += 4;
46 }
47 }
48 // fill top
49 for (int y = 0; y < bottom_size; y++)
50 {
51 _mm_storeu_ps(outptr, v);
52 outptr += 4;
53 }
54 }
55
padding_replicate_pack4_sse(const Mat & src,Mat & dst,int top,int bottom,int left,int right)56 static void padding_replicate_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
57 {
58 const float* ptr = src;
59 float* outptr = dst;
60
61 // fill top
62 for (int y = 0; y < top; y++)
63 {
64 const float* ptr0 = ptr;
65 __m128 _p = _mm_loadu_ps(ptr0);
66 for (int x = 0; x < left; x++)
67 {
68 _mm_storeu_ps(outptr, _p);
69 outptr += 4;
70 }
71 for (int x = 0; x < src.w; x++)
72 {
73 _p = _mm_loadu_ps(ptr0);
74 _mm_storeu_ps(outptr, _p);
75 ptr0 += 4;
76 outptr += 4;
77 }
78 for (int x = 0; x < right; x++)
79 {
80 _mm_storeu_ps(outptr, _p);
81 outptr += 4;
82 }
83 }
84 // fill center
85 for (int y = 0; y < src.h; y++)
86 {
87 __m128 _p = _mm_loadu_ps(ptr);
88 for (int x = 0; x < left; x++)
89 {
90 _mm_storeu_ps(outptr, _p);
91 outptr += 4;
92 }
93 for (int x = 0; x < src.w; x++)
94 {
95 _p = _mm_loadu_ps(ptr);
96 _mm_storeu_ps(outptr, _p);
97 ptr += 4;
98 outptr += 4;
99 }
100 for (int x = 0; x < right; x++)
101 {
102 _mm_storeu_ps(outptr, _p);
103 outptr += 4;
104 }
105 }
106 // fill bottom
107 ptr -= src.w * 4;
108 for (int y = 0; y < bottom; y++)
109 {
110 const float* ptr0 = ptr;
111 __m128 _p = _mm_loadu_ps(ptr0);
112 for (int x = 0; x < left; x++)
113 {
114 _mm_storeu_ps(outptr, _p);
115 outptr += 4;
116 }
117 for (int x = 0; x < src.w; x++)
118 {
119 _p = _mm_loadu_ps(ptr0);
120 _mm_storeu_ps(outptr, _p);
121 ptr0 += 4;
122 outptr += 4;
123 }
124 for (int x = 0; x < right; x++)
125 {
126 _mm_storeu_ps(outptr, _p);
127 outptr += 4;
128 }
129 }
130 }
131
padding_reflect_pack4_sse(const Mat & src,Mat & dst,int top,int bottom,int left,int right)132 static void padding_reflect_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
133 {
134 const float* ptr = src;
135 float* outptr = dst;
136
137 // fill top
138 ptr += top * src.w * 4;
139 for (int y = 0; y < top; y++)
140 {
141 const float* ptr0 = ptr;
142 for (int x = 0; x < left; x++)
143 {
144 __m128 _p = _mm_loadu_ps(ptr0 + (left - x) * 4);
145 _mm_storeu_ps(outptr, _p);
146 outptr += 4;
147 }
148 for (int x = 0; x < src.w; x++)
149 {
150 __m128 _p = _mm_loadu_ps(ptr0);
151 _mm_storeu_ps(outptr, _p);
152 ptr0 += 4;
153 outptr += 4;
154 }
155 for (int x = 0; x < right; x++)
156 {
157 __m128 _p = _mm_loadu_ps(ptr0 - 8 - x * 4);
158 _mm_storeu_ps(outptr, _p);
159 outptr += 4;
160 }
161 ptr -= src.w * 4;
162 }
163 // fill center
164 for (int y = 0; y < src.h; y++)
165 {
166 for (int x = 0; x < left; x++)
167 {
168 __m128 _p = _mm_loadu_ps(ptr + (left - x) * 4);
169 _mm_storeu_ps(outptr, _p);
170 outptr += 4;
171 }
172 for (int x = 0; x < src.w; x++)
173 {
174 __m128 _p = _mm_loadu_ps(ptr);
175 _mm_storeu_ps(outptr, _p);
176 ptr += 4;
177 outptr += 4;
178 }
179 for (int x = 0; x < right; x++)
180 {
181 __m128 _p = _mm_loadu_ps(ptr - 8 - x * 4);
182 _mm_storeu_ps(outptr, _p);
183 outptr += 4;
184 }
185 }
186 // fill bottom
187 ptr -= 2 * src.w * 4;
188 for (int y = 0; y < bottom; y++)
189 {
190 const float* ptr0 = ptr;
191 for (int x = 0; x < left; x++)
192 {
193 __m128 _p = _mm_loadu_ps(ptr0 + (left - x) * 4);
194 _mm_storeu_ps(outptr, _p);
195 outptr += 4;
196 }
197 for (int x = 0; x < src.w; x++)
198 {
199 __m128 _p = _mm_loadu_ps(ptr0);
200 _mm_storeu_ps(outptr, _p);
201 ptr0 += 4;
202 outptr += 4;
203 }
204 for (int x = 0; x < right; x++)
205 {
206 __m128 _p = _mm_loadu_ps(ptr0 - 8 - x * 4);
207 _mm_storeu_ps(outptr, _p);
208 outptr += 4;
209 }
210 ptr -= src.w * 4;
211 }
212 }
213