1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING.  If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
 * \page volk_32fc_32f_add_32fc
25  *
26  * \b Overview
27  *
28  * Adds two vectors together element by element:
29  *
30  * c[i] = a[i] + b[i]
31  *
32  * <b>Dispatcher Prototype</b>
33  * \code
34  * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points)
35  * \endcode
36  *
37  * \b Inputs
 * \li aVector: First input vector (complex, lv_32fc_t).
 * \li bVector: Second input vector (real, float).
 * \li num_points: The number of points in each input vector.
41  *
42  * \b Outputs
43  * \li cVector: The output vector.
44  *
45  * \b Example
46  *
 * The following example adds an increasing complex vector and a decreasing real vector such that every sum is 10
48  *
49  * \code
50  *   int N = 10;
51  *   unsigned int alignment = volk_get_alignment();
52  *   lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
53  *   float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
54  *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
55  *
56  *   for(unsigned int ii = 0; ii < N; ++ii){
57  *       increasing[ii] = (lv_32fc_t)ii;
58  *       decreasing[ii] = 10.f - (float)ii;
59  *   }
60  *
61  *   volk_32fc_32f_add_32fc(out, increasing, decreasing, N);
62  *
63  *   for(unsigned int ii = 0; ii < N; ++ii){
 *       printf("out[%u] = %1.2f + %1.2fj\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
65  *   }
66  *
67  *   volk_free(increasing);
68  *   volk_free(decreasing);
69  *   volk_free(out);
70  * \endcode
71  */
72 
73 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
74 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
75 
76 #ifdef LV_HAVE_GENERIC
77 
78 static inline void
volk_32fc_32f_add_32fc_generic(lv_32fc_t * cVector,const lv_32fc_t * aVector,const float * bVector,unsigned int num_points)79 volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
80                             const float* bVector, unsigned int num_points)
81 {
82   lv_32fc_t* cPtr = cVector;
83   const lv_32fc_t* aPtr = aVector;
84   const float* bPtr=  bVector;
85   unsigned int number = 0;
86 
87   for(number = 0; number < num_points; number++){
88     *cPtr++ = (*aPtr++) + (*bPtr++);
89   }
90 }
91 #endif /* LV_HAVE_GENERIC */
92 
93 
94 #ifdef LV_HAVE_AVX
95 #include <immintrin.h>
96 
97 static inline void
volk_32fc_32f_add_32fc_u_avx(lv_32fc_t * cVector,const lv_32fc_t * aVector,const float * bVector,unsigned int num_points)98 volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
99                           const float* bVector, unsigned int num_points)
100 {
101   unsigned int number = 0;
102   const unsigned int eighthPoints = num_points / 8;
103 
104   lv_32fc_t* cPtr = cVector;
105   const lv_32fc_t* aPtr = aVector;
106   const float* bPtr=  bVector;
107 
108   __m256 aVal1, aVal2, bVal, cVal1, cVal2;
109   __m256 cpx_b1, cpx_b2;
110   __m256 zero;
111   zero = _mm256_setzero_ps();
112   __m256 tmp1, tmp2;
113   for(;number < eighthPoints; number++){
114 
115     aVal1 = _mm256_loadu_ps((float *) aPtr);
116     aVal2 = _mm256_loadu_ps((float *) (aPtr+4));
117     bVal = _mm256_loadu_ps(bPtr);
118     cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
119     cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
120 
121     tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
122     tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
123 
124     cVal1 = _mm256_add_ps(aVal1, tmp1);
125     cVal2 = _mm256_add_ps(aVal2, tmp2);
126 
127     _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container
128     _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
129 
130     aPtr += 8;
131     bPtr += 8;
132     cPtr += 8;
133   }
134 
135   number = eighthPoints * 8;
136   for(;number < num_points; number++){
137     *cPtr++ = (*aPtr++) + (*bPtr++);
138   }
139 }
140 #endif /* LV_HAVE_AVX */
141 
142 #ifdef LV_HAVE_AVX
143 #include <immintrin.h>
144 
145 static inline void
volk_32fc_32f_add_32fc_a_avx(lv_32fc_t * cVector,const lv_32fc_t * aVector,const float * bVector,unsigned int num_points)146 volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
147                           const float* bVector, unsigned int num_points)
148 {
149   unsigned int number = 0;
150   const unsigned int eighthPoints = num_points / 8;
151 
152   lv_32fc_t* cPtr = cVector;
153   const lv_32fc_t* aPtr = aVector;
154   const float* bPtr=  bVector;
155 
156   __m256 aVal1, aVal2, bVal, cVal1, cVal2;
157   __m256 cpx_b1, cpx_b2;
158   __m256 zero;
159   zero = _mm256_setzero_ps();
160   __m256 tmp1, tmp2;
161   for(;number < eighthPoints; number++){
162 
163     aVal1 = _mm256_load_ps((float *) aPtr);
164     aVal2 = _mm256_load_ps((float *) (aPtr+4));
165     bVal = _mm256_load_ps(bPtr);
166     cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
167     cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
168 
169     tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
170     tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
171 
172     cVal1 = _mm256_add_ps(aVal1, tmp1);
173     cVal2 = _mm256_add_ps(aVal2, tmp2);
174 
175     _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container
176     _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
177 
178     aPtr += 8;
179     bPtr += 8;
180     cPtr += 8;
181   }
182 
183   number = eighthPoints * 8;
184   for(;number < num_points; number++){
185     *cPtr++ = (*aPtr++) + (*bPtr++);
186   }
187 }
188 #endif /* LV_HAVE_AVX */
189 
190 #ifdef LV_HAVE_NEON
191 #include <arm_neon.h>
192 
/*!
 * \brief NEON kernel: adds a real vector to a complex vector, point by point.
 *
 * Processes 16 points per iteration, then finishes the remaining
 * (num_points % 16) points with scalar code.
 *
 * Each vld4q_f32 reads 16 floats (8 points) and de-interleaves them with
 * stride 4. Treating every lv_32fc_t as two consecutive floats {f0, f1}:
 *   val[0] = f0 of points 0,2,4,6    val[1] = f1 of points 0,2,4,6
 *   val[2] = f0 of points 1,3,5,7    val[3] = f1 of points 1,3,5,7
 * Each vld2q_f32 reads 8 floats of b and splits them into even-indexed
 * (val[0]) and odd-indexed (val[1]) elements, which line up with val[0] and
 * val[2] above. Only those two lanes are modified; val[1] and val[3] are
 * stored back untouched. f0 is assumed to be the real part -- confirm against
 * the lv_32fc_t definition.
 *
 * \param cVector    Output vector; num_points values are written.
 * \param aVector    Complex input vector; num_points values are read.
 * \param bVector    Real input vector; num_points values are read.
 * \param num_points Number of points to process.
 */
static inline void
volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                            const float* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;

  float32x4x4_t aVal0, aVal1;
  float32x4x2_t bVal0, bVal1;

  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number = 0;
  for(; number < sixteenthPoints; number++){
    // Points 0..7 and 8..15 of a, de-interleaved as described above.
    aVal0 = vld4q_f32((const float*)aPtr);
    aPtr += 8;
    aVal1 = vld4q_f32((const float*)aPtr);
    aPtr += 8;
    __VOLK_PREFETCH(aPtr+16);

    // Elements 0..7 and 8..15 of b, split into even/odd indices.
    bVal0 = vld2q_f32(bPtr);
    bPtr += 8;
    bVal1 = vld2q_f32(bPtr);
    bPtr += 8;
    __VOLK_PREFETCH(bPtr+16);

    // Even-indexed points pair with val[0], odd-indexed points with val[2].
    aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
    aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);

    aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
    aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);

    // vst4q_f32 re-interleaves the four lanes into the original layout.
    vst4q_f32((float*)(cPtr), aVal0);
    cPtr += 8;
    vst4q_f32((float*)(cPtr), aVal1);
    cPtr += 8;
  }

  // Scalar tail for the points that did not fill a block of 16.
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
235 #endif /* LV_HAVE_NEON */
236 
237 
#endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
239