// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file intentionally does not have header guards, it's included from
// vector_math_avx.h and from vector_math_sse.h with different macro
// definitions. The following line silences a presubmit warning that would
// otherwise be triggered by this: no-include-guard-because-multiply-included

#include "build/build_config.h"

#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)

#include <algorithm>
#include <cmath>

#include "third_party/blink/renderer/platform/audio/audio_array.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"

namespace blink {
namespace vector_math {
namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {

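// The function names here appear to follow Apple's vDSP routines (vDSP_conv,
// vDSP_vadd, vDSP_maxmgv, ...), which provide these operations on macOS,
// where this file is excluded (see the OS_MACOSX guard above).
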
// This stride is chosen so that the same prepared filter created by
// AVX::PrepareFilterForConv can be used by both AVX::Conv and sse::Conv.
// A prepared filter created by sse::PrepareFilterForConv can only be used
// by sse::Conv.
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
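// With AVX, kPackedFloatsPerRegister is 8 and the stride is 1; with SSE it is
// 4 and the stride is 2, so sse::Conv reads every other 128-bit slot of an
// AVX-prepared filter, i.e. the low half of each broadcast 256-bit register.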

bool IsAligned(const float* p) {
  constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
  constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
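  // The pointer is register-aligned when its address is a multiple of
  // kBytesPerRegister (16 with SSE, 32 with AVX), i.e. when the low bits
  // selected by the mask are all zero.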
  return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
}

void PrepareFilterForConv(const float* filter_p,
                          int filter_stride,
                          size_t filter_size,
                          AudioFloatArray* prepared_filter) {
  // Only contiguous convolution is implemented. Correlation (positive
  // |filter_stride|) and support for non-contiguous vectors are not
  // implemented.
  DCHECK_EQ(-1, filter_stride);
  DCHECK(prepared_filter);

  // Reverse the filter and repeat each value across a vector.
  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
                            filter_size);
  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
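  // Slot |kReversedFilterStride * i| holds filter_p[-i] broadcast across all
  // lanes. With SSE (stride 2) the odd slots are left unwritten; Conv below
  // indexes with the same stride and never reads them.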
  for (size_t i = 0; i < filter_size; ++i) {
    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
  }
}

// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
          const float* prepared_filter_p,
          float* dest_p,
          uint32_t frames_to_process,
          size_t filter_size) {
  const float* const dest_end_p = dest_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);

  const MType* reversed_filter =
      reinterpret_cast<const MType*>(prepared_filter_p);

  // Do convolution with kPackedFloatsPerRegister inputs at a time.
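  // Because every coefficient is broadcast across all lanes, lane j of
  // |m_convolution_sum| accumulates sum(source_p[k + j] * filter_p[-k]) over
  // k, so each pass through the filter yields kPackedFloatsPerRegister
  // consecutive outputs at once.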
  while (dest_p < dest_end_p) {
    MType m_convolution_sum = MM_PS(setzero)();

    // |filter_size| is a multiple of kPackedFloatsPerRegister so we can
    // manually unroll the loop by kPackedFloatsPerRegister.
    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
        size_t k = i + j;
        MType m_product;
        MType m_source;

        // |source_p + k| is in general not register-aligned, hence the
        // unaligned load.
        m_source = MM_PS(loadu)(source_p + k);
        m_product =
            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
      }
    }
    MM_PS(storeu)(dest_p, m_convolution_sum);

    source_p += kPackedFloatsPerRegister;
    dest_p += kPackedFloatsPerRegister;
  }
}
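
// A minimal usage sketch, as a hedged illustration; |filter|, |source|, and
// |dest| are hypothetical caller-side arrays, not names from this file.
// Since only |filter_stride| == -1 is supported (see the DCHECK above),
// PrepareFilterForConv() must be handed a pointer to the *last* coefficient
// so it can read backwards:
//
//   AudioFloatArray prepared;
//   PrepareFilterForConv(filter + filter_size - 1, -1, filter_size,
//                        &prepared);
//   Conv(source, prepared.Data(), dest, frames_to_process, filter_size);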

// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define ADD_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(add)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

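  // |source1p| is DCHECKed aligned above, so only |source2p| and |dest_p|
  // need the aligned/unaligned dispatch.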
  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      ADD_ALL(load, store);
    } else {
      ADD_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      ADD_ALL(loadu, store);
    } else {
      ADD_ALL(loadu, storeu);
    }
  }

#undef ADD_ALL
}

// dest[k] = clip(source[k], low_threshold, high_threshold)
//         = max(low_threshold, min(high_threshold, source[k]))
void Vclip(const float* source_p,
           const float* low_threshold_p,
           const float* high_threshold_p,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
  MType m_high_threshold = MM_PS(set1)(*high_threshold_p);

#define CLIP_ALL(storeDest)                                                  \
  while (source_p < source_end_p) {                                         \
    MType m_source = MM_PS(load)(source_p);                                 \
    MType m_dest =                                                          \
        MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                                       \
    source_p += kPackedFloatsPerRegister;                                   \
    dest_p += kPackedFloatsPerRegister;                                     \
  }

  if (IsAligned(dest_p)) {
    CLIP_ALL(store);
  } else {
    CLIP_ALL(storeu);
  }

#undef CLIP_ALL
}

// *max_p = max(*max_p, source_max) where
// source_max = max(abs(source[k])) for all k
void Vmaxmgv(const float* source_p, float* max_p, uint32_t frames_to_process) {
  constexpr uint32_t kMask = 0x7FFFFFFFu;
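  // kMask has every bit set except the IEEE 754 sign bit.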

  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_mask = MM_PS(set1)(*reinterpret_cast<const float*>(&kMask));
  MType m_max = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    // Calculate the absolute value by ANDing the source with the mask,
    // which will set the sign bit to 0.
    m_source = MM_PS(and)(m_source, m_mask);
    m_max = MM_PS(max)(m_source, m_max);
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* maxes = reinterpret_cast<const float*>(&m_max);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *max_p = std::max(*max_p, maxes[i]);
}

// dest[k] = source1[k] * source2[k]
void Vmul(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadSource2, storeDest)         \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(mul)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(load, store);
    } else {
      MULTIPLY_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(loadu, store);
    } else {
      MULTIPLY_ALL(loadu, storeu);
    }
  }

#undef MULTIPLY_ALL
}

// dest[k] += scale * source[k]
void Vsma(const float* source_p,
          const float* scale,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_AND_ADD_ALL(loadDest, storeDest)        \
  while (source_p < source_end_p) {                             \
    MType m_source = MM_PS(load)(source_p);                     \
    MType m_dest = MM_PS(loadDest)(dest_p);                     \
    m_dest = MM_PS(add)(m_dest, MM_PS(mul)(m_scale, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                           \
    source_p += kPackedFloatsPerRegister;                       \
    dest_p += kPackedFloatsPerRegister;                         \
  }

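  // Both |loadDest| and |storeDest| touch |dest_p|, so a single alignment
  // check selects between the fully aligned and fully unaligned variants.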
  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_AND_ADD_ALL(load, store);
  } else {
    SCALAR_MULTIPLY_AND_ADD_ALL(loadu, storeu);
  }

#undef SCALAR_MULTIPLY_AND_ADD_ALL
}

// dest[k] = scale * source[k]
void Vsmul(const float* source_p,
           const float* scale,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_ALL(storeDest)            \
  while (source_p < source_end_p) {               \
    MType m_source = MM_PS(load)(source_p);       \
    MType m_dest = MM_PS(mul)(m_scale, m_source); \
    MM_PS(storeDest)(dest_p, m_dest);             \
    source_p += kPackedFloatsPerRegister;         \
    dest_p += kPackedFloatsPerRegister;           \
  }

  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_ALL(store);
  } else {
    SCALAR_MULTIPLY_ALL(storeu);
  }

#undef SCALAR_MULTIPLY_ALL
}

// sum += sum(source[k]^2) for all k
void Vsvesq(const float* source_p, float* sum_p, uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_sum = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    m_sum = MM_PS(add)(m_sum, MM_PS(mul)(m_source, m_source));
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* sums = reinterpret_cast<const float*>(&m_sum);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *sum_p += sums[i];
}

// real_dest[k] = real1[k] * real2[k] - imag1[k] * imag2[k]
// imag_dest[k] = real1[k] * imag2[k] + imag1[k] * real2[k]
void Zvmul(const float* real1p,
           const float* imag1p,
           const float* real2p,
           const float* imag2p,
           float* real_dest_p,
           float* imag_dest_p,
           uint32_t frames_to_process) {
  DCHECK(IsAligned(real1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadOtherThanReal1, storeDest)                           \
  for (size_t i = 0u; i < frames_to_process; i += kPackedFloatsPerRegister) { \
    MType real1 = MM_PS(load)(real1p + i);                                    \
    MType real2 = MM_PS(loadOtherThanReal1)(real2p + i);                      \
    MType imag1 = MM_PS(loadOtherThanReal1)(imag1p + i);                      \
    MType imag2 = MM_PS(loadOtherThanReal1)(imag2p + i);                      \
    MType real =                                                              \
        MM_PS(sub)(MM_PS(mul)(real1, real2), MM_PS(mul)(imag1, imag2));       \
    MType imag =                                                              \
        MM_PS(add)(MM_PS(mul)(real1, imag2), MM_PS(mul)(imag1, real2));       \
    MM_PS(storeDest)(real_dest_p + i, real);                                  \
    MM_PS(storeDest)(imag_dest_p + i, imag);                                  \
  }

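  // Unlike Vadd() and Vmul(), the aligned fast path is used only when every
  // pointer is aligned; dispatching on the five unchecked pointers
  // independently would need 32 expansions of MULTIPLY_ALL.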
  if (IsAligned(imag1p) && IsAligned(real2p) && IsAligned(imag2p) &&
      IsAligned(real_dest_p) && IsAligned(imag_dest_p)) {
    MULTIPLY_ALL(load, store);
  } else {
    MULTIPLY_ALL(loadu, storeu);
  }

#undef MULTIPLY_ALL
}

}  // namespace VECTOR_MATH_SIMD_NAMESPACE_NAME
}  // namespace vector_math
}  // namespace blink

#endif  // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)