1 /* -*- c++ -*- */
2 /*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23 /*!
24 * \page volk_32f_64f_add_64f
25 *
26 * \b Overview
27 *
28 * Adds two input vectors and store result as a double-precision vectors. One
29 * of the input vector is defined as a single precision floating point, so
30 * upcasting is performed before the addition
31 *
32 * c[i] = a[i] + b[i]
33 *
34 * <b>Dispatcher Prototype</b>
35 * \code
36 * void volk_32f_64f_add_64f(double* cVector, const double* aVector, const
37 * double* bVector, unsigned int num_points) \endcode
38 *
39 * \b Inputs
40 * \li aVector: First input vector.
41 * \li bVector: Second input vector.
42 * \li num_points: The number of values in both input vectors.
43 *
44 * \b Outputs
45 * \li cVector: The output vector.
46 *
47 * \b Example
48 * add elements of an increasing vector by those of a decreasing vector.
49 * \code
50 * int N = 10;
51 * unsigned int alignment = volk_get_alignment();
52 * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
53 * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
54 * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
55 *
56 * for(unsigned int ii = 0; ii < N; ++ii){
57 * increasing[ii] = (double)ii;
58 * decreasing[ii] = 10.f - (double)ii;
59 * }
60 *
61 * volk_32f_64f_add_64f(out, increasing, decreasing, N);
62 *
63 * for(unsigned int ii = 0; ii < N; ++ii){
64 * printf("out[%u] = %1.2F\n", ii, out[ii]);
65 * }
66 *
67 * volk_free(increasing);
68 * volk_free(decreasing);
69 * volk_free(out);
70 * \endcode
71 */
72
73 #ifndef INCLUDED_volk_32f_64f_add_64f_H
74 #define INCLUDED_volk_32f_64f_add_64f_H
75
76 #include <inttypes.h>
77
78 #ifdef LV_HAVE_GENERIC
79
volk_32f_64f_add_64f_generic(double * cVector,const float * aVector,const double * bVector,unsigned int num_points)80 static inline void volk_32f_64f_add_64f_generic(double *cVector,
81 const float *aVector,
82 const double *bVector,
83 unsigned int num_points) {
84 double *cPtr = cVector;
85 const float *aPtr = aVector;
86 const double *bPtr = bVector;
87 unsigned int number = 0;
88
89 for (number = 0; number < num_points; number++) {
90 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
91 }
92 }
93
94 #endif /* LV_HAVE_GENERIC */
95
96 #ifdef LV_HAVE_NEONV8
97 #include <arm_neon.h>
98
volk_32f_64f_add_64f_neon(double * cVector,const float * aVector,const double * bVector,unsigned int num_points)99 static inline void volk_32f_64f_add_64f_neon(double *cVector,
100 const float *aVector,
101 const double *bVector,
102 unsigned int num_points) {
103 unsigned int number = 0;
104 const unsigned int half_points = num_points / 2;
105
106 double *cPtr = cVector;
107 const float *aPtr = aVector;
108 const double *bPtr = bVector;
109
110 float64x2_t aVal, bVal, cVal;
111 float32x2_t aVal1;
112 for (number = 0; number < half_points; number++) {
113 // Load in to NEON registers
114 aVal1 = vld1_f32(aPtr);
115 bVal = vld1q_f64(bPtr);
116 __VOLK_PREFETCH(aPtr + 2);
117 __VOLK_PREFETCH(bPtr + 2);
118 aPtr += 2; // q uses quadwords, 4 floats per vadd
119 bPtr += 2;
120
121 // Vector conversion
122 aVal = vcvt_f64_f32(aVal1);
123 // vector add
124 cVal = vaddq_f64(aVal, bVal);
125 // Store the results back into the C container
126 vst1q_f64(cPtr, cVal);
127
128 cPtr += 2;
129 }
130
131 number = half_points * 2; // should be = num_points
132 for (; number < num_points; number++) {
133 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
134 }
135 }
136
137 #endif /* LV_HAVE_NEONV8 */
138
139 #ifdef LV_HAVE_AVX
140
141 #include <immintrin.h>
142 #include <xmmintrin.h>
143
volk_32f_64f_add_64f_u_avx(double * cVector,const float * aVector,const double * bVector,unsigned int num_points)144 static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
145 const float *aVector,
146 const double *bVector,
147 unsigned int num_points) {
148 unsigned int number = 0;
149 const unsigned int eighth_points = num_points / 8;
150
151 double *cPtr = cVector;
152 const float *aPtr = aVector;
153 const double *bPtr = bVector;
154
155 __m256 aVal;
156 __m128 aVal1, aVal2;
157 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
158 for (; number < eighth_points; number++) {
159
160 aVal = _mm256_loadu_ps(aPtr);
161 bVal1 = _mm256_loadu_pd(bPtr);
162 bVal2 = _mm256_loadu_pd(bPtr + 4);
163
164 aVal1 = _mm256_extractf128_ps(aVal, 0);
165 aVal2 = _mm256_extractf128_ps(aVal, 1);
166
167 aDbl1 = _mm256_cvtps_pd(aVal1);
168 aDbl2 = _mm256_cvtps_pd(aVal2);
169
170 cVal1 = _mm256_add_pd(aDbl1, bVal1);
171 cVal2 = _mm256_add_pd(aDbl2, bVal2);
172
173 _mm256_storeu_pd(cPtr,
174 cVal1); // Store the results back into the C container
175 _mm256_storeu_pd(cPtr + 4,
176 cVal2); // Store the results back into the C container
177
178 aPtr += 8;
179 bPtr += 8;
180 cPtr += 8;
181 }
182
183 number = eighth_points * 8;
184 for (; number < num_points; number++) {
185 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
186 }
187 }
188
189 #endif /* LV_HAVE_AVX */
190
191 #ifdef LV_HAVE_AVX
192
193 #include <immintrin.h>
194 #include <xmmintrin.h>
195
volk_32f_64f_add_64f_a_avx(double * cVector,const float * aVector,const double * bVector,unsigned int num_points)196 static inline void volk_32f_64f_add_64f_a_avx(double *cVector,
197 const float *aVector,
198 const double *bVector,
199 unsigned int num_points) {
200 unsigned int number = 0;
201 const unsigned int eighth_points = num_points / 8;
202
203 double *cPtr = cVector;
204 const float *aPtr = aVector;
205 const double *bPtr = bVector;
206
207 __m256 aVal;
208 __m128 aVal1, aVal2;
209 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
210 for (; number < eighth_points; number++) {
211
212 aVal = _mm256_load_ps(aPtr);
213 bVal1 = _mm256_load_pd(bPtr);
214 bVal2 = _mm256_load_pd(bPtr + 4);
215
216 aVal1 = _mm256_extractf128_ps(aVal, 0);
217 aVal2 = _mm256_extractf128_ps(aVal, 1);
218
219 aDbl1 = _mm256_cvtps_pd(aVal1);
220 aDbl2 = _mm256_cvtps_pd(aVal2);
221
222 cVal1 = _mm256_add_pd(aDbl1, bVal1);
223 cVal2 = _mm256_add_pd(aDbl2, bVal2);
224
225 _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
226 _mm256_store_pd(cPtr + 4,
227 cVal2); // Store the results back into the C container
228
229 aPtr += 8;
230 bPtr += 8;
231 cPtr += 8;
232 }
233
234 number = eighth_points * 8;
235 for (; number < num_points; number++) {
236 *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
237 }
238 }
239
240 #endif /* LV_HAVE_AVX */
241
242 #endif /* INCLUDED_volk_32f_64f_add_64f_u_H */
243