// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file intentionally does not have header guards; it is included from
// vector_math_avx.h and from vector_math_sse.h with different macro
// definitions. The following line silences a presubmit warning that would
// otherwise be triggered by this: no-include-guard-because-multiply-included

#include "build/build_config.h"

#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)

#include <algorithm>
#include <cmath>

#include "third_party/blink/renderer/platform/audio/audio_array.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"

namespace blink {
namespace vector_math {
namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {

// This stride is chosen so that the same prepared filter created by
// AVX::PrepareFilterForConv can be used by both AVX::Conv and sse::Conv.
// A prepared filter created by sse::PrepareFilterForConv can only be used
// by sse::Conv.
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
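// In concrete terms (assuming the usual widths, i.e. kPackedFloatsPerRegister
// is 8 for AVX and 4 for SSE), the stride is 1 register per filter tap for
// AVX and 2 registers per tap for SSE, so both variants read a prepared
// filter laid out as 8 floats per tap.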

bool IsAligned(const float* p) {
  constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
  constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
  return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
}
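// For example, assuming kBitsPerRegister is 128 for SSE and 256 for AVX, the
// mask is 0xF (16-byte alignment) or 0x1F (32-byte alignment) respectively,
// and a pointer is aligned exactly when its low mask bits are all zero.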

void PrepareFilterForConv(const float* filter_p,
                          int filter_stride,
                          size_t filter_size,
                          AudioFloatArray* prepared_filter) {
  // Only contiguous convolution is implemented. Correlation (positive
  // |filter_stride|) and support for non-contiguous vectors are not
  // implemented.
  DCHECK_EQ(-1, filter_stride);
  DCHECK(prepared_filter);

  // Reverse the filter and repeat each value across a vector.
  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
                            filter_size);
  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
  for (size_t i = 0; i < filter_size; ++i) {
    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
  }
}
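// As a sketch, for a hypothetical filter {a, b, c} with |filter_p| pointing
// at its last element c (|filter_stride| == -1), the prepared buffer holds
// one broadcast vector per tap, in reversed order:
//   [c c c ...][b b b ...][a a a ...]
// where each group is kPackedFloatsPerRegister floats wide and groups are
// spaced kReversedFilterStride registers apart.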

// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
void Conv(const float* source_p,
          const float* prepared_filter_p,
          float* dest_p,
          uint32_t frames_to_process,
          size_t filter_size) {
  const float* const dest_end_p = dest_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);

  const MType* reversed_filter =
      reinterpret_cast<const MType*>(prepared_filter_p);

  // Do convolution with kPackedFloatsPerRegister inputs at a time.
  while (dest_p < dest_end_p) {
    MType m_convolution_sum = MM_PS(setzero)();

    // |filter_size| is a multiple of kPackedFloatsPerRegister, so we can
    // manually unroll the loop by kPackedFloatsPerRegister.
    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
        size_t k = i + j;
        MType m_product;
        MType m_source;

        m_source = MM_PS(loadu)(source_p + k);
        m_product =
            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
      }
    }
    MM_PS(storeu)(dest_p, m_convolution_sum);

    source_p += kPackedFloatsPerRegister;
    dest_p += kPackedFloatsPerRegister;
  }
}
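// A minimal usage sketch (hypothetical variable names, assuming a filter of
// |filter_size| taps with |filter| pointing at its last element):
//   AudioFloatArray prepared;
//   PrepareFilterForConv(filter, -1, filter_size, &prepared);
//   Conv(source, prepared.Data(), dest, frames_to_process, filter_size);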

// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define ADD_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(add)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      ADD_ALL(load, store);
    } else {
      ADD_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      ADD_ALL(loadu, store);
    } else {
      ADD_ALL(loadu, storeu);
    }
  }

#undef ADD_ALL
}

// dest[k] = clip(source[k], low_threshold, high_threshold)
//         = max(low_threshold, min(high_threshold, source[k]))
void Vclip(const float* source_p,
           const float* low_threshold_p,
           const float* high_threshold_p,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
  MType m_high_threshold = MM_PS(set1)(*high_threshold_p);

#define CLIP_ALL(storeDest)                                                  \
  while (source_p < source_end_p) {                                          \
    MType m_source = MM_PS(load)(source_p);                                  \
    MType m_dest =                                                           \
        MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                                        \
    source_p += kPackedFloatsPerRegister;                                    \
    dest_p += kPackedFloatsPerRegister;                                      \
  }

  if (IsAligned(dest_p)) {
    CLIP_ALL(store);
  } else {
    CLIP_ALL(storeu);
  }

#undef CLIP_ALL
}

// *max_p = max(*max_p, source_max) where
// source_max = max(abs(source[k])) for all k
void Vmaxmgv(const float* source_p, float* max_p, uint32_t frames_to_process) {
  constexpr uint32_t kMask = 0x7FFFFFFFu;

  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_mask = MM_PS(set1)(*reinterpret_cast<const float*>(&kMask));
  MType m_max = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    // Calculate the absolute value by ANDing the source with the mask,
    // which will set the sign bit to 0.
    m_source = MM_PS(and)(m_source, m_mask);
    m_max = MM_PS(max)(m_source, m_max);
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* maxes = reinterpret_cast<const float*>(&m_max);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *max_p = std::max(*max_p, maxes[i]);
}
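// Worked sketch of the lane fold above, assuming 4 floats per register (SSE)
// and a single iteration: for source = {-3.0f, 1.0f, -2.0f, 0.5f}, masking
// the sign bits yields {3.0f, 1.0f, 2.0f, 0.5f}, m_max holds those values,
// and the scalar loop folds the lanes into *max_p, giving 3.0f when *max_p
// started at 0.0f.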

// dest[k] = source1[k] * source2[k]
void Vmul(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadSource2, storeDest)         \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(mul)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(load, store);
    } else {
      MULTIPLY_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(loadu, store);
    } else {
      MULTIPLY_ALL(loadu, storeu);
    }
  }

#undef MULTIPLY_ALL
}

// dest[k] += scale * source[k]
void Vsma(const float* source_p,
          const float* scale,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_AND_ADD_ALL(loadDest, storeDest)        \
  while (source_p < source_end_p) {                             \
    MType m_source = MM_PS(load)(source_p);                     \
    MType m_dest = MM_PS(loadDest)(dest_p);                     \
    m_dest = MM_PS(add)(m_dest, MM_PS(mul)(m_scale, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                           \
    source_p += kPackedFloatsPerRegister;                       \
    dest_p += kPackedFloatsPerRegister;                         \
  }

  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_AND_ADD_ALL(load, store);
  } else {
    SCALAR_MULTIPLY_AND_ADD_ALL(loadu, storeu);
  }

#undef SCALAR_MULTIPLY_AND_ADD_ALL
}

// dest[k] = scale * source[k]
void Vsmul(const float* source_p,
           const float* scale,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_ALL(storeDest)            \
  while (source_p < source_end_p) {               \
    MType m_source = MM_PS(load)(source_p);       \
    MType m_dest = MM_PS(mul)(m_scale, m_source); \
    MM_PS(storeDest)(dest_p, m_dest);             \
    source_p += kPackedFloatsPerRegister;         \
    dest_p += kPackedFloatsPerRegister;           \
  }

  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_ALL(store);
  } else {
    SCALAR_MULTIPLY_ALL(storeu);
  }

#undef SCALAR_MULTIPLY_ALL
}

// sum += sum(source[k]^2) for all k
void Vsvesq(const float* source_p, float* sum_p, uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_sum = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    m_sum = MM_PS(add)(m_sum, MM_PS(mul)(m_source, m_source));
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* sums = reinterpret_cast<const float*>(&m_sum);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *sum_p += sums[i];
}

// real_dest[k] = real1[k] * real2[k] - imag1[k] * imag2[k]
// imag_dest[k] = real1[k] * imag2[k] + imag1[k] * real2[k]
void Zvmul(const float* real1p,
           const float* imag1p,
           const float* real2p,
           const float* imag2p,
           float* real_dest_p,
           float* imag_dest_p,
           uint32_t frames_to_process) {
  DCHECK(IsAligned(real1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadOtherThanReal1, storeDest)                           \
  for (size_t i = 0u; i < frames_to_process; i += kPackedFloatsPerRegister) { \
    MType real1 = MM_PS(load)(real1p + i);                                    \
    MType real2 = MM_PS(loadOtherThanReal1)(real2p + i);                      \
    MType imag1 = MM_PS(loadOtherThanReal1)(imag1p + i);                      \
    MType imag2 = MM_PS(loadOtherThanReal1)(imag2p + i);                      \
    MType real =                                                              \
        MM_PS(sub)(MM_PS(mul)(real1, real2), MM_PS(mul)(imag1, imag2));       \
    MType imag =                                                              \
        MM_PS(add)(MM_PS(mul)(real1, imag2), MM_PS(mul)(imag1, real2));       \
    MM_PS(storeDest)(real_dest_p + i, real);                                  \
    MM_PS(storeDest)(imag_dest_p + i, imag);                                  \
  }

  if (IsAligned(imag1p) && IsAligned(real2p) && IsAligned(imag2p) &&
      IsAligned(real_dest_p) && IsAligned(imag_dest_p)) {
    MULTIPLY_ALL(load, store);
  } else {
    MULTIPLY_ALL(loadu, storeu);
  }

#undef MULTIPLY_ALL
}

}  // namespace VECTOR_MATH_SIMD_NAMESPACE_NAME
}  // namespace vector_math
}  // namespace blink

#endif  // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)