1 /* -*- c++ -*- */
2 /*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23 /*!
 * \page volk_32fc_32f_add_32fc
25 *
26 * \b Overview
27 *
 * Adds a real-valued (float) vector to a complex vector, element by element:
29 *
30 * c[i] = a[i] + b[i]
31 *
32 * <b>Dispatcher Prototype</b>
33 * \code
34 * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points)
35 * \endcode
36 *
37 * \b Inputs
 * \li aVector: First input vector (complex, lv_32fc_t).
 * \li bVector: Second input vector (real, float).
 * \li num_points: The number of values in each of the input vectors.
41 *
42 * \b Outputs
43 * \li cVector: The output vector.
44 *
45 * \b Example
46 *
 * The following example adds the increasing and decreasing vectors such that the result of every summation pair is 10.
48 *
49 * \code
50 * int N = 10;
51 * unsigned int alignment = volk_get_alignment();
52 * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
53 * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
54 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
55 *
56 * for(unsigned int ii = 0; ii < N; ++ii){
57 * increasing[ii] = (lv_32fc_t)ii;
58 * decreasing[ii] = 10.f - (float)ii;
59 * }
60 *
61 * volk_32fc_32f_add_32fc(out, increasing, decreasing, N);
62 *
63 * for(unsigned int ii = 0; ii < N; ++ii){
 *     printf("out[%u] = %1.2f + %1.2fj\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
65 * }
66 *
67 * volk_free(increasing);
68 * volk_free(decreasing);
69 * volk_free(out);
70 * \endcode
71 */
72
73 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
74 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
75
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable scalar implementation of c[i] = a[i] + b[i].
 *
 * Each float read from bVector is added to the lv_32fc_t element of aVector
 * at the same index, and the sum is written to cVector at that index.
 *
 * \param cVector    Output buffer; num_points lv_32fc_t values are written.
 * \param aVector    First input; num_points lv_32fc_t values are read.
 * \param bVector    Second input; num_points float values are read.
 * \param num_points Number of elements to process; 0 performs no reads or writes.
 */
static inline void
volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                               const float* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
92
93
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief AVX implementation of c[i] = a[i] + b[i] using unaligned loads/stores.
 *
 * Processes 8 elements per iteration. The code treats every lv_32fc_t as two
 * consecutive floats: the 8 floats loaded from bVector are interleaved with
 * zeros so that each b value lines up with the first float of the matching
 * lv_32fc_t element and a zero lines up with the second float. The remaining
 * num_points % 8 elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer; num_points lv_32fc_t values are written.
 * \param aVector    First input; num_points lv_32fc_t values are read.
 * \param bVector    Second input; num_points float values are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                             const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
  __m256 cpx_b1, cpx_b2;
  __m256 tmp1, tmp2;
  const __m256 zero = _mm256_setzero_ps();

  for(; number < eighthPoints; number++){
    // Two 8-float loads cover elements 0..3 and 4..7 of a; one load covers b0..b7
    aVal1 = _mm256_loadu_ps((float *) aPtr);
    aVal2 = _mm256_loadu_ps((float *) (aPtr+4));
    bVal = _mm256_loadu_ps(bPtr);

    // Unpack works per 128-bit lane, so the interleaved pairs come out lane-split
    cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
    cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0

    // Recombine lanes to restore element order
    tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); // b0, 0, b1, 0, b2, 0, b3, 0
    tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); // b4, 0, b5, 0, b6, 0, b7, 0

    cVal1 = _mm256_add_ps(aVal1, tmp1);
    cVal2 = _mm256_add_ps(aVal2, tmp2);

    _mm256_storeu_ps((float *) cPtr, cVal1);     // Store elements 0..3 of the result
    _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store elements 4..7 of the result

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  // Scalar tail for the elements that do not fill a whole group of 8
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
141
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief AVX implementation of c[i] = a[i] + b[i] using aligned loads/stores.
 *
 * Processes 8 elements per iteration with _mm256_load_ps/_mm256_store_ps, so
 * all three buffers must be 32-byte aligned. The code treats every lv_32fc_t
 * as two consecutive floats: the 8 floats loaded from bVector are interleaved
 * with zeros so that each b value lines up with the first float of the
 * matching lv_32fc_t element and a zero lines up with the second float. The
 * remaining num_points % 8 elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer (32-byte aligned); num_points lv_32fc_t values are written.
 * \param aVector    First input (32-byte aligned); num_points lv_32fc_t values are read.
 * \param bVector    Second input (32-byte aligned); num_points float values are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                             const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
  __m256 cpx_b1, cpx_b2;
  __m256 tmp1, tmp2;
  const __m256 zero = _mm256_setzero_ps();

  for(; number < eighthPoints; number++){
    // Two 8-float loads cover elements 0..3 and 4..7 of a; one load covers b0..b7
    aVal1 = _mm256_load_ps((float *) aPtr);
    aVal2 = _mm256_load_ps((float *) (aPtr+4));
    bVal = _mm256_load_ps(bPtr);

    // Unpack works per 128-bit lane, so the interleaved pairs come out lane-split
    cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
    cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0

    // Recombine lanes to restore element order
    tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); // b0, 0, b1, 0, b2, 0, b3, 0
    tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); // b4, 0, b5, 0, b6, 0, b7, 0

    cVal1 = _mm256_add_ps(aVal1, tmp1);
    cVal2 = _mm256_add_ps(aVal2, tmp2);

    _mm256_store_ps((float *) cPtr, cVal1);     // Store elements 0..3 of the result
    _mm256_store_ps((float *) (cPtr+4), cVal2); // Store elements 4..7 of the result

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  // Scalar tail for the elements that do not fill a whole group of 8
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
189
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON implementation of c[i] = a[i] + b[i].
 *
 * Processes 16 elements per iteration as two groups of 8. Each group of a is
 * loaded with a 4-way de-interleaving load and each group of b with a 2-way
 * de-interleaving load, so that the even- and odd-indexed b values can be
 * added to the matching lanes of a. The remaining num_points % 16 elements
 * are handled by a scalar loop.
 *
 * \param cVector    Output buffer; num_points lv_32fc_t values are written.
 * \param aVector    First input; num_points lv_32fc_t values are read.
 * \param bVector    Second input; num_points float values are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                            const float* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;

  float32x4x4_t aVal0, aVal1;
  float32x4x2_t bVal0, bVal1;

  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number = 0;
  for(; number < sixteenthPoints; number++){
    // vld4q_f32 reads 16 floats (8 elements of a, each treated as two floats):
    //   val[0] / val[1]: first / second float of elements 0, 2, 4, 6
    //   val[2] / val[3]: first / second float of elements 1, 3, 5, 7
    aVal0 = vld4q_f32((const float*)aPtr);
    aPtr += 8;
    aVal1 = vld4q_f32((const float*)aPtr);
    aPtr += 8;
    __VOLK_PREFETCH(aPtr+16); // macro defined elsewhere; presumably a prefetch hint

    // vld2q_f32 reads 8 floats: val[0] = b0, b2, b4, b6 and val[1] = b1, b3, b5, b7
    bVal0 = vld2q_f32((const float*)bPtr);
    bPtr += 8;
    bVal1 = vld2q_f32((const float*)bPtr);
    bPtr += 8;
    __VOLK_PREFETCH(bPtr+16);

    // Only the first float of every element receives b; val[1] and val[3] pass through
    aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
    aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);

    aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
    aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);

    // vst4q_f32 re-interleaves the four registers back into element order
    vst4q_f32((float*)(cPtr), aVal0);
    cPtr += 8;
    vst4q_f32((float*)(cPtr), aVal1);
    cPtr += 8;
  }

  // Scalar tail for the elements that do not fill a whole group of 16
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_NEON */
236
237
#endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
239