Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
57 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
58 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
59 
60 #include <inttypes.h>
61 #include <stdio.h>
62 #include <volk/volk_complex.h>
63 
64 #ifdef LV_HAVE_AVX2
65 #include <immintrin.h>
66 
67 static inline void
68 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
69  const lv_8sc_t* aVector,
70  const lv_8sc_t* bVector,
71  const float scalar,
72  unsigned int num_points)
73 {
74  unsigned int number = 0;
75  const unsigned int oneEigthPoints = num_points / 8;
76 
77  __m256i x, y, realz, imagz;
78  __m256 ret, retlo, rethi;
79  lv_32fc_t* c = cVector;
80  const lv_8sc_t* a = aVector;
81  const lv_8sc_t* b = bVector;
82  __m256i conjugateSign =
83  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
84 
85  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
86 
87  for (; number < oneEigthPoints; number++) {
88  // Convert 8 bit values into 16 bit values
89  x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
90  y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
91 
92  // Calculate the ar*cr - ai*(-ci) portions
93  realz = _mm256_madd_epi16(x, y);
94 
95  // Calculate the complex conjugate of the cr + ci j values
96  y = _mm256_sign_epi16(y, conjugateSign);
97 
98  // Shift the order of the cr and ci values
99  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
100  _MM_SHUFFLE(2, 3, 0, 1));
101 
102  // Calculate the ar*(-ci) + cr*(ai)
103  imagz = _mm256_madd_epi16(x, y);
104 
105  // Interleave real and imaginary and then convert to float values
106  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
107 
108  // Normalize the floating point values
109  retlo = _mm256_mul_ps(retlo, invScalar);
110 
111  // Interleave real and imaginary and then convert to float values
112  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
113 
114  // Normalize the floating point values
115  rethi = _mm256_mul_ps(rethi, invScalar);
116 
117  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
118  _mm256_store_ps((float*)c, ret);
119  c += 4;
120 
121  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
122  _mm256_store_ps((float*)c, ret);
123  c += 4;
124 
125  a += 8;
126  b += 8;
127  }
128 
129  number = oneEigthPoints * 8;
130  float* cFloatPtr = (float*)&cVector[number];
131  int8_t* a8Ptr = (int8_t*)&aVector[number];
132  int8_t* b8Ptr = (int8_t*)&bVector[number];
133  for (; number < num_points; number++) {
134  float aReal = (float)*a8Ptr++;
135  float aImag = (float)*a8Ptr++;
136  lv_32fc_t aVal = lv_cmake(aReal, aImag);
137  float bReal = (float)*b8Ptr++;
138  float bImag = (float)*b8Ptr++;
139  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
140  lv_32fc_t temp = aVal * bVal;
141 
142  *cFloatPtr++ = lv_creal(temp) / scalar;
143  *cFloatPtr++ = lv_cimag(temp) / scalar;
144  }
145 }
146 #endif /* LV_HAVE_AVX2*/
147 
148 
149 #ifdef LV_HAVE_SSE4_1
150 #include <smmintrin.h>
151 
152 static inline void
153 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
154  const lv_8sc_t* aVector,
155  const lv_8sc_t* bVector,
156  const float scalar,
157  unsigned int num_points)
158 {
159  unsigned int number = 0;
160  const unsigned int quarterPoints = num_points / 4;
161 
162  __m128i x, y, realz, imagz;
163  __m128 ret;
164  lv_32fc_t* c = cVector;
165  const lv_8sc_t* a = aVector;
166  const lv_8sc_t* b = bVector;
167  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
168 
169  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
170 
171  for (; number < quarterPoints; number++) {
172  // Convert into 8 bit values into 16 bit values
173  x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
174  y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
175 
176  // Calculate the ar*cr - ai*(-ci) portions
177  realz = _mm_madd_epi16(x, y);
178 
179  // Calculate the complex conjugate of the cr + ci j values
180  y = _mm_sign_epi16(y, conjugateSign);
181 
182  // Shift the order of the cr and ci values
183  y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
184  _MM_SHUFFLE(2, 3, 0, 1));
185 
186  // Calculate the ar*(-ci) + cr*(ai)
187  imagz = _mm_madd_epi16(x, y);
188 
189  // Interleave real and imaginary and then convert to float values
190  ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
191 
192  // Normalize the floating point values
193  ret = _mm_mul_ps(ret, invScalar);
194 
195  // Store the floating point values
196  _mm_store_ps((float*)c, ret);
197  c += 2;
198 
199  // Interleave real and imaginary and then convert to float values
200  ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
201 
202  // Normalize the floating point values
203  ret = _mm_mul_ps(ret, invScalar);
204 
205  // Store the floating point values
206  _mm_store_ps((float*)c, ret);
207  c += 2;
208 
209  a += 4;
210  b += 4;
211  }
212 
213  number = quarterPoints * 4;
214  float* cFloatPtr = (float*)&cVector[number];
215  int8_t* a8Ptr = (int8_t*)&aVector[number];
216  int8_t* b8Ptr = (int8_t*)&bVector[number];
217  for (; number < num_points; number++) {
218  float aReal = (float)*a8Ptr++;
219  float aImag = (float)*a8Ptr++;
220  lv_32fc_t aVal = lv_cmake(aReal, aImag);
221  float bReal = (float)*b8Ptr++;
222  float bImag = (float)*b8Ptr++;
223  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
224  lv_32fc_t temp = aVal * bVal;
225 
226  *cFloatPtr++ = lv_creal(temp) / scalar;
227  *cFloatPtr++ = lv_cimag(temp) / scalar;
228  }
229 }
230 #endif /* LV_HAVE_SSE4_1 */
231 
232 
233 #ifdef LV_HAVE_GENERIC
234 
235 static inline void
237  const lv_8sc_t* aVector,
238  const lv_8sc_t* bVector,
239  const float scalar,
240  unsigned int num_points)
241 {
242  unsigned int number = 0;
243  float* cPtr = (float*)cVector;
244  const float invScalar = 1.0 / scalar;
245  int8_t* a8Ptr = (int8_t*)aVector;
246  int8_t* b8Ptr = (int8_t*)bVector;
247  for (number = 0; number < num_points; number++) {
248  float aReal = (float)*a8Ptr++;
249  float aImag = (float)*a8Ptr++;
250  lv_32fc_t aVal = lv_cmake(aReal, aImag);
251  float bReal = (float)*b8Ptr++;
252  float bImag = (float)*b8Ptr++;
253  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
254  lv_32fc_t temp = aVal * bVal;
255 
256  *cPtr++ = (lv_creal(temp) * invScalar);
257  *cPtr++ = (lv_cimag(temp) * invScalar);
258  }
259 }
260 #endif /* LV_HAVE_GENERIC */
261 
262 
263 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
264 
265 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
266 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
267 
268 #include <inttypes.h>
269 #include <stdio.h>
270 #include <volk/volk_complex.h>
271 
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274 
275 static inline void
276 volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
277  const lv_8sc_t* aVector,
278  const lv_8sc_t* bVector,
279  const float scalar,
280  unsigned int num_points)
281 {
282  unsigned int number = 0;
283  const unsigned int oneEigthPoints = num_points / 8;
284 
285  __m256i x, y, realz, imagz;
286  __m256 ret, retlo, rethi;
287  lv_32fc_t* c = cVector;
288  const lv_8sc_t* a = aVector;
289  const lv_8sc_t* b = bVector;
290  __m256i conjugateSign =
291  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
292 
293  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
294 
295  for (; number < oneEigthPoints; number++) {
296  // Convert 8 bit values into 16 bit values
297  x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
298  y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
299 
300  // Calculate the ar*cr - ai*(-ci) portions
301  realz = _mm256_madd_epi16(x, y);
302 
303  // Calculate the complex conjugate of the cr + ci j values
304  y = _mm256_sign_epi16(y, conjugateSign);
305 
306  // Shift the order of the cr and ci values
307  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
308  _MM_SHUFFLE(2, 3, 0, 1));
309 
310  // Calculate the ar*(-ci) + cr*(ai)
311  imagz = _mm256_madd_epi16(x, y);
312 
313  // Interleave real and imaginary and then convert to float values
314  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
315 
316  // Normalize the floating point values
317  retlo = _mm256_mul_ps(retlo, invScalar);
318 
319  // Interleave real and imaginary and then convert to float values
320  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
321 
322  // Normalize the floating point values
323  rethi = _mm256_mul_ps(rethi, invScalar);
324 
325  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
326  _mm256_storeu_ps((float*)c, ret);
327  c += 4;
328 
329  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
330  _mm256_storeu_ps((float*)c, ret);
331  c += 4;
332 
333  a += 8;
334  b += 8;
335  }
336 
337  number = oneEigthPoints * 8;
338  float* cFloatPtr = (float*)&cVector[number];
339  int8_t* a8Ptr = (int8_t*)&aVector[number];
340  int8_t* b8Ptr = (int8_t*)&bVector[number];
341  for (; number < num_points; number++) {
342  float aReal = (float)*a8Ptr++;
343  float aImag = (float)*a8Ptr++;
344  lv_32fc_t aVal = lv_cmake(aReal, aImag);
345  float bReal = (float)*b8Ptr++;
346  float bImag = (float)*b8Ptr++;
347  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
348  lv_32fc_t temp = aVal * bVal;
349 
350  *cFloatPtr++ = lv_creal(temp) / scalar;
351  *cFloatPtr++ = lv_cimag(temp) / scalar;
352  }
353 }
354 #endif /* LV_HAVE_AVX2*/
355 
356 
357 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:236
#define lv_cmake(r, i)
Definition: volk_complex.h:73
float complex lv_32fc_t
Definition: volk_complex.h:70
#define lv_creal(x)
Definition: volk_complex.h:92
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:66
#define lv_cimag(x)
Definition: volk_complex.h:94