1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING.  If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_64f_add_64f
25  *
26  * \b Overview
27  *
28  * Adds two input vectors and store result as a double-precision vectors. One
29  * of the input vector is defined as a single precision floating point, so
30  * upcasting is performed before the addition
31  *
32  * c[i] = a[i] + b[i]
33  *
34  * <b>Dispatcher Prototype</b>
35  * \code
36  * void volk_32f_64f_add_64f(double* cVector, const float* aVector, const
37  * double* bVector, unsigned int num_points) \endcode
38  *
39  * \b Inputs
40  * \li aVector: First input vector.
41  * \li bVector: Second input vector.
42  * \li num_points: The number of values in both input vectors.
43  *
44  * \b Outputs
45  * \li cVector: The output vector.
46  *
47  * \b Example
48  * add elements of an increasing vector by those of a decreasing vector.
49  * \code
50  *   int N = 10;
51  *   unsigned int alignment = volk_get_alignment();
52  *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
53  *   double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
54  *   double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
55  *
56  *   for(unsigned int ii = 0; ii < N; ++ii){
57  *       increasing[ii] = (float)ii;
58  *       decreasing[ii] = 10.0 - (double)ii;
59  *   }
60  *
61  *   volk_32f_64f_add_64f(out, increasing, decreasing, N);
62  *
63  *   for(unsigned int ii = 0; ii < N; ++ii){
64  *       printf("out[%u] = %1.2F\n", ii, out[ii]);
65  *   }
66  *
67  *   volk_free(increasing);
68  *   volk_free(decreasing);
69  *   volk_free(out);
70  * \endcode
71  */
72 
73 #ifndef INCLUDED_volk_32f_64f_add_64f_H
74 #define INCLUDED_volk_32f_64f_add_64f_H
75 
76 #include <inttypes.h>
77 
78 #ifdef LV_HAVE_GENERIC
79 
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    /* Scalar reference implementation: widen each single-precision input
     * to double, then write the element-wise sum into the output vector. */
    for (unsigned int idx = 0; idx < num_points; idx++) {
        cVector[idx] = (double)aVector[idx] + bVector[idx];
    }
}
93 
94 #endif /* LV_HAVE_GENERIC */
95 
96 #ifdef LV_HAVE_NEONV8
97 #include <arm_neon.h>
98 
volk_32f_64f_add_64f_neon(double * cVector,const float * aVector,const double * bVector,unsigned int num_points)99 static inline void volk_32f_64f_add_64f_neon(double *cVector,
100                                              const float *aVector,
101                                              const double *bVector,
102                                              unsigned int num_points) {
103   unsigned int number = 0;
104   const unsigned int half_points = num_points / 2;
105 
106   double *cPtr = cVector;
107   const float *aPtr = aVector;
108   const double *bPtr = bVector;
109 
110   float64x2_t aVal, bVal, cVal;
111   float32x2_t aVal1;
112   for (number = 0; number < half_points; number++) {
113     // Load in to NEON registers
114     aVal1 = vld1_f32(aPtr);
115     bVal = vld1q_f64(bPtr);
116     __VOLK_PREFETCH(aPtr + 2);
117     __VOLK_PREFETCH(bPtr + 2);
118     aPtr += 2; // q uses quadwords, 4 floats per vadd
119     bPtr += 2;
120 
121     // Vector conversion
122     aVal = vcvt_f64_f32(aVal1);
123     // vector add
124     cVal = vaddq_f64(aVal, bVal);
125     // Store the results back into the C container
126     vst1q_f64(cPtr, cVal);
127 
128     cPtr += 2;
129   }
130 
131   number = half_points * 2; // should be = num_points
132   for (; number < num_points; number++) {
133     *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
134   }
135 }
136 
137 #endif /* LV_HAVE_NEONV8 */
138 
139 #ifdef LV_HAVE_AVX
140 
141 #include <immintrin.h>
142 #include <xmmintrin.h>
143 
static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    /* AVX path, unaligned loads/stores: process eight elements per
     * iteration. Eight floats are split into two 128-bit halves, each
     * widened to four doubles, then added to the matching doubles. */
    const unsigned int octet_count = num_points / 8;
    double* out = cVector;
    const float* inA = aVector;
    const double* inB = bVector;

    for (unsigned int n = 0; n < octet_count; n++) {
        __m256 a8 = _mm256_loadu_ps(inA);
        __m256d bLo = _mm256_loadu_pd(inB);
        __m256d bHi = _mm256_loadu_pd(inB + 4);

        /* Extract low/high 4-float halves and widen each to 4 doubles. */
        __m256d aLo = _mm256_cvtps_pd(_mm256_extractf128_ps(a8, 0));
        __m256d aHi = _mm256_cvtps_pd(_mm256_extractf128_ps(a8, 1));

        /* Add and store the eight results. */
        _mm256_storeu_pd(out, _mm256_add_pd(aLo, bLo));
        _mm256_storeu_pd(out + 4, _mm256_add_pd(aHi, bHi));

        inA += 8;
        inB += 8;
        out += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    for (unsigned int n = octet_count * 8; n < num_points; n++) {
        *out++ = (double)(*inA++) + (*inB++);
    }
}
188 
189 #endif /* LV_HAVE_AVX */
190 
191 #ifdef LV_HAVE_AVX
192 
193 #include <immintrin.h>
194 #include <xmmintrin.h>
195 
static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    /* AVX path, aligned loads/stores (inputs/output must be 32-byte
     * aligned): process eight elements per iteration. Eight floats are
     * split into two 128-bit halves, each widened to four doubles, then
     * added to the matching doubles. */
    const unsigned int octet_count = num_points / 8;
    double* out = cVector;
    const float* inA = aVector;
    const double* inB = bVector;

    for (unsigned int n = 0; n < octet_count; n++) {
        __m256 a8 = _mm256_load_ps(inA);
        __m256d bLo = _mm256_load_pd(inB);
        __m256d bHi = _mm256_load_pd(inB + 4);

        /* Extract low/high 4-float halves and widen each to 4 doubles. */
        __m256d aLo = _mm256_cvtps_pd(_mm256_extractf128_ps(a8, 0));
        __m256d aHi = _mm256_cvtps_pd(_mm256_extractf128_ps(a8, 1));

        /* Add and store the eight results. */
        _mm256_store_pd(out, _mm256_add_pd(aLo, bLo));
        _mm256_store_pd(out + 4, _mm256_add_pd(aHi, bHi));

        inA += 8;
        inB += 8;
        out += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    for (unsigned int n = octet_count * 8; n < num_points; n++) {
        *out++ = (double)(*inA++) + (*inB++);
    }
}
239 
240 #endif /* LV_HAVE_AVX */
241 
242 #endif /* INCLUDED_volk_32f_64f_add_64f_H */
243