Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
63 #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
64 #define INCLUDED_volk_32f_accumulator_s32f_a_H
65 
66 #include <inttypes.h>
67 #include <volk/volk_common.h>
68 
69 #ifdef LV_HAVE_AVX
70 #include <immintrin.h>
71 
/*!
 * \brief Adds up num_points floats from inputBuffer and stores the total in
 *        *result, using aligned 256-bit AVX loads.
 *
 * Complete groups of eight floats are accumulated lane-wise in a vector
 * register. The eight lane totals are then folded in lane order (0..7), and
 * the remaining num_points % 8 elements are added one at a time.
 *
 * \param result      Receives the sum; written once, at the end.
 * \param inputBuffer Values to sum. Read with _mm256_load_ps, so the pointer
 *                    must be 32-byte aligned.
 * \param num_points  Number of floats read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_a_avx(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;
    const float* src = inputBuffer;
    __VOLK_ATTR_ALIGNED(32) float lanes[8];
    __m256 laneSums = _mm256_setzero_ps();
    float total;
    unsigned int i;

    /* Vector part: eight floats per iteration. */
    for (i = 0; i < vectorCount; i++) {
        laneSums = _mm256_add_ps(laneSums, _mm256_load_ps(src));
        src += 8;
    }

    /* Spill the lanes and fold them left to right. */
    _mm256_store_ps(lanes, laneSums);
    total = lanes[0];
    for (i = 1; i < 8; i++) {
        total += lanes[i];
    }

    /* Scalar tail: whatever did not fill a whole vector. */
    for (i = vectorCount * 8; i < num_points; i++) {
        total += *src++;
    }

    *result = total;
}
109 #endif /* LV_HAVE_AVX */
110 
111 
112 #ifdef LV_HAVE_AVX
113 #include <immintrin.h>
114 
/*!
 * \brief Adds up num_points floats from inputBuffer and stores the total in
 *        *result, using unaligned 256-bit AVX loads.
 *
 * Complete groups of eight floats are accumulated lane-wise in a vector
 * register. The eight lane totals are then folded in lane order (0..7), and
 * the remaining num_points % 8 elements are added one at a time.
 *
 * \param result      Receives the sum; written once, at the end.
 * \param inputBuffer Values to sum. Read with _mm256_loadu_ps, so no
 *                    particular alignment is required.
 * \param num_points  Number of floats read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_u_avx(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;
    const float* src = inputBuffer;
    /* The scratch buffer is local and aligned, so the aligned store is safe. */
    __VOLK_ATTR_ALIGNED(32) float lanes[8];
    __m256 laneSums = _mm256_setzero_ps();
    float total;
    unsigned int i;

    /* Vector part: eight floats per iteration. */
    for (i = 0; i < vectorCount; i++) {
        laneSums = _mm256_add_ps(laneSums, _mm256_loadu_ps(src));
        src += 8;
    }

    /* Spill the lanes and fold them left to right. */
    _mm256_store_ps(lanes, laneSums);
    total = lanes[0];
    for (i = 1; i < 8; i++) {
        total += lanes[i];
    }

    /* Scalar tail: whatever did not fill a whole vector. */
    for (i = vectorCount * 8; i < num_points; i++) {
        total += *src++;
    }

    *result = total;
}
152 #endif /* LV_HAVE_AVX */
153 
154 
155 #ifdef LV_HAVE_SSE
156 #include <xmmintrin.h>
157 
/*!
 * \brief Adds up num_points floats from inputBuffer and stores the total in
 *        *result, using aligned 128-bit SSE loads.
 *
 * Complete groups of four floats are accumulated lane-wise in a vector
 * register. The four lane totals are then folded in lane order (0..3), and
 * the remaining num_points % 4 elements are added one at a time.
 *
 * \param result      Receives the sum; written once, at the end.
 * \param inputBuffer Values to sum. Read with _mm_load_ps, so the pointer
 *                    must be 16-byte aligned.
 * \param num_points  Number of floats read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_a_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 4;
    const float* src = inputBuffer;
    __VOLK_ATTR_ALIGNED(16) float lanes[4];
    __m128 laneSums = _mm_setzero_ps();
    float total;
    unsigned int i;

    /* Vector part: four floats per iteration. */
    for (i = 0; i < vectorCount; i++) {
        laneSums = _mm_add_ps(laneSums, _mm_load_ps(src));
        src += 4;
    }

    /* Spill the lanes and fold them left to right. */
    _mm_store_ps(lanes, laneSums);
    total = lanes[0];
    for (i = 1; i < 4; i++) {
        total += lanes[i];
    }

    /* Scalar tail: whatever did not fill a whole vector. */
    for (i = vectorCount * 4; i < num_points; i++) {
        total += *src++;
    }

    *result = total;
}
191 #endif /* LV_HAVE_SSE */
192 
193 
194 #ifdef LV_HAVE_SSE
195 #include <xmmintrin.h>
196 
/*!
 * \brief Adds up num_points floats from inputBuffer and stores the total in
 *        *result, using unaligned 128-bit SSE loads.
 *
 * Complete groups of four floats are accumulated lane-wise in a vector
 * register. The four lane totals are then folded in lane order (0..3), and
 * the remaining num_points % 4 elements are added one at a time.
 *
 * \param result      Receives the sum; written once, at the end.
 * \param inputBuffer Values to sum. Read with _mm_loadu_ps, so no particular
 *                    alignment is required.
 * \param num_points  Number of floats read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_u_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    float returnValue = 0;
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const float* aPtr = inputBuffer;
    /* The scratch buffer is local and aligned, so the aligned store is safe. */
    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];

    __m128 accumulator = _mm_setzero_ps();
    __m128 aVal = _mm_setzero_ps();

    for (; number < quarterPoints; number++) {
        /* Unaligned load: _mm_load_ps would fault when inputBuffer is not
         * 16-byte aligned, which this unaligned kernel must tolerate. */
        aVal = _mm_loadu_ps(aPtr);
        accumulator = _mm_add_ps(accumulator, aVal);
        aPtr += 4;
    }

    _mm_store_ps(tempBuffer, accumulator);

    /* Fold the four lane totals in lane order. */
    returnValue = tempBuffer[0];
    returnValue += tempBuffer[1];
    returnValue += tempBuffer[2];
    returnValue += tempBuffer[3];

    /* Scalar tail: the num_points % 4 elements left after the vector loop. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
230 #endif /* LV_HAVE_SSE */
231 
232 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Adds up num_points floats from inputBuffer and stores the total in
 *        *result, one element at a time in index order.
 *
 * \param result      Receives the sum; written once, at the end. Set to 0
 *                    when num_points is 0.
 * \param inputBuffer Values to sum.
 * \param num_points  Number of floats read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float total = 0;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        total += inputBuffer[i];
    }

    *result = total;
}
246 #endif /* LV_HAVE_GENERIC */
247 
248 #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:115
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:72
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:197
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:233
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:158