/*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
12 
13 #include <arm_neon.h>
14 
DotProductWithScaleNeon(int32_t * cross_correlation,const int16_t * vector1,const int16_t * vector2,int length,int scaling)15 static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
16                                            const int16_t* vector1,
17                                            const int16_t* vector2,
18                                            int length,
19                                            int scaling) {
20   int i = 0;
21   int len1 = length >> 3;
22   int len2 = length & 7;
23   int64x2_t sum0 = vdupq_n_s64(0);
24   int64x2_t sum1 = vdupq_n_s64(0);
25 
26   if (length < 0) {
27     *cross_correlation = 0;
28     return;
29   }
30 
31   for (i = len1; i > 0; i -= 1) {
32     int16x8_t seq1_16x8 = vld1q_s16(vector1);
33     int16x8_t seq2_16x8 = vld1q_s16(vector2);
34 #if defined(WEBRTC_ARCH_ARM64)
35     int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
36                                vget_low_s16(seq2_16x8));
37     int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
38 #else
39     int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
40                                vget_low_s16(seq2_16x8));
41     int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
42                                vget_high_s16(seq2_16x8));
43 #endif
44     sum0 = vpadalq_s32(sum0, tmp0);
45     sum1 = vpadalq_s32(sum1, tmp1);
46     vector1 += 8;
47     vector2 += 8;
48   }
49 
50   // Calculate the rest of the samples.
51   int64_t sum_res = 0;
52   for (i = len2; i > 0; i -= 1) {
53     sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
54     vector1++;
55     vector2++;
56   }
57 
58   sum0 = vaddq_s64(sum0, sum1);
59 #if defined(WEBRTC_ARCH_ARM64)
60   int64_t sum2 = vaddvq_s64(sum0);
61   *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
62 #else
63   int64x1_t shift = vdup_n_s64(-scaling);
64   int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
65   sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
66   sum2 = vshl_s64(sum2, shift);
67   vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
68 #endif
69 }
70 
/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms.
 *
 * Writes |dim_cross_correlation| values to |cross_correlation|. Output i is
 * the dot product of the first |dim_seq| samples of |seq1| with the
 * |dim_seq| samples of |seq2| starting at offset step_seq2 * i, shifted right
 * by |right_shifts| bits (see DotProductWithScaleNeon()).
 *
 * If |dim_cross_correlation| is zero or negative, nothing is written.
 */
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                    const int16_t* seq1,
                                    const int16_t* seq2,
                                    int16_t dim_seq,
                                    int16_t dim_cross_correlation,
                                    int16_t right_shifts,
                                    int16_t step_seq2) {
  for (int i = 0; i < dim_cross_correlation; i++) {
    // |seq1| stays fixed; |seq2| is advanced by |step_seq2| samples per lag.
    DotProductWithScaleNeon(&cross_correlation[i],
                            seq1,
                            seq2 + (step_seq2 * i),
                            dim_seq,
                            right_shifts);
  }
}
93