Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <inttypes.h>
73 #include <math.h>
74 #include <stdio.h>
75 
76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
78 
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int eighthPoints = num_points / 8;
91  unsigned int i = 0;
92 
93  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
94  fzeroes;
95  __m256 sine, cosine, condition1, condition2;
96  __m256i q, r, ones, twos, fours;
97 
98  m4pi = _mm256_set1_ps(1.273239545);
99  pio4A = _mm256_set1_ps(0.78515625);
100  pio4B = _mm256_set1_ps(0.241876e-3);
101  ffours = _mm256_set1_ps(4.0);
102  ftwos = _mm256_set1_ps(2.0);
103  fones = _mm256_set1_ps(1.0);
104  fzeroes = _mm256_setzero_ps();
105  ones = _mm256_set1_epi32(1);
106  twos = _mm256_set1_epi32(2);
107  fours = _mm256_set1_epi32(4);
108 
109  cp1 = _mm256_set1_ps(1.0);
110  cp2 = _mm256_set1_ps(0.83333333e-1);
111  cp3 = _mm256_set1_ps(0.2777778e-2);
112  cp4 = _mm256_set1_ps(0.49603e-4);
113  cp5 = _mm256_set1_ps(0.551e-6);
114 
115  for (; number < eighthPoints; number++) {
116  aVal = _mm256_load_ps(aPtr);
117  s = _mm256_sub_ps(aVal,
118  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
119  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
120  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
121  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
122 
123  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
124  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
125 
126  s = _mm256_div_ps(
127  s,
128  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
129  s = _mm256_mul_ps(s, s);
130  // Evaluate Taylor series
131  s = _mm256_mul_ps(
132  _mm256_fmadd_ps(
133  _mm256_fmsub_ps(
134  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
135  s,
136  cp1),
137  s);
138 
139  for (i = 0; i < 3; i++) {
140  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
141  }
142  s = _mm256_div_ps(s, ftwos);
143 
144  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
145  cosine = _mm256_sub_ps(fones, s);
146 
147  condition1 = _mm256_cmp_ps(
148  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
149  fzeroes,
150  _CMP_NEQ_UQ);
151  condition2 = _mm256_cmp_ps(
152  _mm256_cmp_ps(
153  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
154  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
155  _CMP_NEQ_UQ);
156  // Need this condition only for cos
157  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
158  // twos), fours)), fzeroes);
159 
160  sine =
161  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
162  sine = _mm256_sub_ps(
163  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
164  _mm256_store_ps(bPtr, sine);
165  aPtr += 8;
166  bPtr += 8;
167  }
168 
169  number = eighthPoints * 8;
170  for (; number < num_points; number++) {
171  *bPtr++ = sin(*aPtr++);
172  }
173 }
174 
175 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
176 
177 #ifdef LV_HAVE_AVX2
178 #include <immintrin.h>
179 
180 static inline void
181 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
182 {
183  float* bPtr = bVector;
184  const float* aPtr = aVector;
185 
186  unsigned int number = 0;
187  unsigned int eighthPoints = num_points / 8;
188  unsigned int i = 0;
189 
190  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
191  fzeroes;
192  __m256 sine, cosine, condition1, condition2;
193  __m256i q, r, ones, twos, fours;
194 
195  m4pi = _mm256_set1_ps(1.273239545);
196  pio4A = _mm256_set1_ps(0.78515625);
197  pio4B = _mm256_set1_ps(0.241876e-3);
198  ffours = _mm256_set1_ps(4.0);
199  ftwos = _mm256_set1_ps(2.0);
200  fones = _mm256_set1_ps(1.0);
201  fzeroes = _mm256_setzero_ps();
202  ones = _mm256_set1_epi32(1);
203  twos = _mm256_set1_epi32(2);
204  fours = _mm256_set1_epi32(4);
205 
206  cp1 = _mm256_set1_ps(1.0);
207  cp2 = _mm256_set1_ps(0.83333333e-1);
208  cp3 = _mm256_set1_ps(0.2777778e-2);
209  cp4 = _mm256_set1_ps(0.49603e-4);
210  cp5 = _mm256_set1_ps(0.551e-6);
211 
212  for (; number < eighthPoints; number++) {
213  aVal = _mm256_load_ps(aPtr);
214  s = _mm256_sub_ps(aVal,
215  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
216  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
217  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
218  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
219 
220  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
221  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
222 
223  s = _mm256_div_ps(
224  s,
225  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
226  s = _mm256_mul_ps(s, s);
227  // Evaluate Taylor series
228  s = _mm256_mul_ps(
229  _mm256_add_ps(
230  _mm256_mul_ps(
231  _mm256_sub_ps(
232  _mm256_mul_ps(
233  _mm256_add_ps(
234  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
235  s),
236  cp3),
237  s),
238  cp2),
239  s),
240  cp1),
241  s);
242 
243  for (i = 0; i < 3; i++) {
244  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
245  }
246  s = _mm256_div_ps(s, ftwos);
247 
248  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
249  cosine = _mm256_sub_ps(fones, s);
250 
251  condition1 = _mm256_cmp_ps(
252  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
253  fzeroes,
254  _CMP_NEQ_UQ);
255  condition2 = _mm256_cmp_ps(
256  _mm256_cmp_ps(
257  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
258  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
259  _CMP_NEQ_UQ);
260  // Need this condition only for cos
261  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
262  // twos), fours)), fzeroes);
263 
264  sine =
265  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
266  sine = _mm256_sub_ps(
267  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
268  _mm256_store_ps(bPtr, sine);
269  aPtr += 8;
270  bPtr += 8;
271  }
272 
273  number = eighthPoints * 8;
274  for (; number < num_points; number++) {
275  *bPtr++ = sin(*aPtr++);
276  }
277 }
278 
279 #endif /* LV_HAVE_AVX2 for aligned */
280 
281 #ifdef LV_HAVE_SSE4_1
282 #include <smmintrin.h>
283 
284 static inline void
285 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
286 {
287  float* bPtr = bVector;
288  const float* aPtr = aVector;
289 
290  unsigned int number = 0;
291  unsigned int quarterPoints = num_points / 4;
292  unsigned int i = 0;
293 
294  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
295  fzeroes;
296  __m128 sine, cosine, condition1, condition2;
297  __m128i q, r, ones, twos, fours;
298 
299  m4pi = _mm_set1_ps(1.273239545);
300  pio4A = _mm_set1_ps(0.78515625);
301  pio4B = _mm_set1_ps(0.241876e-3);
302  ffours = _mm_set1_ps(4.0);
303  ftwos = _mm_set1_ps(2.0);
304  fones = _mm_set1_ps(1.0);
305  fzeroes = _mm_setzero_ps();
306  ones = _mm_set1_epi32(1);
307  twos = _mm_set1_epi32(2);
308  fours = _mm_set1_epi32(4);
309 
310  cp1 = _mm_set1_ps(1.0);
311  cp2 = _mm_set1_ps(0.83333333e-1);
312  cp3 = _mm_set1_ps(0.2777778e-2);
313  cp4 = _mm_set1_ps(0.49603e-4);
314  cp5 = _mm_set1_ps(0.551e-6);
315 
316  for (; number < quarterPoints; number++) {
317  aVal = _mm_load_ps(aPtr);
318  s = _mm_sub_ps(aVal,
319  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
320  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
321  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
322 
323  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
324  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
325 
326  s = _mm_div_ps(
327  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
328  s = _mm_mul_ps(s, s);
329  // Evaluate Taylor series
330  s = _mm_mul_ps(
331  _mm_add_ps(
332  _mm_mul_ps(
333  _mm_sub_ps(
334  _mm_mul_ps(
335  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
336  cp3),
337  s),
338  cp2),
339  s),
340  cp1),
341  s);
342 
343  for (i = 0; i < 3; i++) {
344  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
345  }
346  s = _mm_div_ps(s, ftwos);
347 
348  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
349  cosine = _mm_sub_ps(fones, s);
350 
351  condition1 = _mm_cmpneq_ps(
352  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
353  condition2 = _mm_cmpneq_ps(
354  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
355  _mm_cmplt_ps(aVal, fzeroes));
356  // Need this condition only for cos
357  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
358  // twos), fours)), fzeroes);
359 
360  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
361  sine =
362  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
363  _mm_store_ps(bPtr, sine);
364  aPtr += 4;
365  bPtr += 4;
366  }
367 
368  number = quarterPoints * 4;
369  for (; number < num_points; number++) {
370  *bPtr++ = sinf(*aPtr++);
371  }
372 }
373 
374 #endif /* LV_HAVE_SSE4_1 for aligned */
375 
376 
377 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
378 
379 #ifndef INCLUDED_volk_32f_sin_32f_u_H
380 #define INCLUDED_volk_32f_sin_32f_u_H
381 
382 #if LV_HAVE_AVX2 && LV_HAVE_FMA
383 #include <immintrin.h>
384 
385 static inline void
386 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
387 {
388  float* bPtr = bVector;
389  const float* aPtr = aVector;
390 
391  unsigned int number = 0;
392  unsigned int eighthPoints = num_points / 8;
393  unsigned int i = 0;
394 
395  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
396  fzeroes;
397  __m256 sine, cosine, condition1, condition2;
398  __m256i q, r, ones, twos, fours;
399 
400  m4pi = _mm256_set1_ps(1.273239545);
401  pio4A = _mm256_set1_ps(0.78515625);
402  pio4B = _mm256_set1_ps(0.241876e-3);
403  ffours = _mm256_set1_ps(4.0);
404  ftwos = _mm256_set1_ps(2.0);
405  fones = _mm256_set1_ps(1.0);
406  fzeroes = _mm256_setzero_ps();
407  ones = _mm256_set1_epi32(1);
408  twos = _mm256_set1_epi32(2);
409  fours = _mm256_set1_epi32(4);
410 
411  cp1 = _mm256_set1_ps(1.0);
412  cp2 = _mm256_set1_ps(0.83333333e-1);
413  cp3 = _mm256_set1_ps(0.2777778e-2);
414  cp4 = _mm256_set1_ps(0.49603e-4);
415  cp5 = _mm256_set1_ps(0.551e-6);
416 
417  for (; number < eighthPoints; number++) {
418  aVal = _mm256_loadu_ps(aPtr);
419  s = _mm256_sub_ps(aVal,
420  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
421  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
422  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
423  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
424 
425  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
426  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
427 
428  s = _mm256_div_ps(
429  s,
430  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
431  s = _mm256_mul_ps(s, s);
432  // Evaluate Taylor series
433  s = _mm256_mul_ps(
434  _mm256_fmadd_ps(
435  _mm256_fmsub_ps(
436  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
437  s,
438  cp1),
439  s);
440 
441  for (i = 0; i < 3; i++) {
442  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
443  }
444  s = _mm256_div_ps(s, ftwos);
445 
446  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
447  cosine = _mm256_sub_ps(fones, s);
448 
449  condition1 = _mm256_cmp_ps(
450  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
451  fzeroes,
452  _CMP_NEQ_UQ);
453  condition2 = _mm256_cmp_ps(
454  _mm256_cmp_ps(
455  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
456  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
457  _CMP_NEQ_UQ);
458  // Need this condition only for cos
459  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
460  // twos), fours)), fzeroes);
461 
462  sine =
463  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
464  sine = _mm256_sub_ps(
465  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
466  _mm256_storeu_ps(bPtr, sine);
467  aPtr += 8;
468  bPtr += 8;
469  }
470 
471  number = eighthPoints * 8;
472  for (; number < num_points; number++) {
473  *bPtr++ = sin(*aPtr++);
474  }
475 }
476 
477 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
478 
479 #ifdef LV_HAVE_AVX2
480 #include <immintrin.h>
481 
482 static inline void
483 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
484 {
485  float* bPtr = bVector;
486  const float* aPtr = aVector;
487 
488  unsigned int number = 0;
489  unsigned int eighthPoints = num_points / 8;
490  unsigned int i = 0;
491 
492  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
493  fzeroes;
494  __m256 sine, cosine, condition1, condition2;
495  __m256i q, r, ones, twos, fours;
496 
497  m4pi = _mm256_set1_ps(1.273239545);
498  pio4A = _mm256_set1_ps(0.78515625);
499  pio4B = _mm256_set1_ps(0.241876e-3);
500  ffours = _mm256_set1_ps(4.0);
501  ftwos = _mm256_set1_ps(2.0);
502  fones = _mm256_set1_ps(1.0);
503  fzeroes = _mm256_setzero_ps();
504  ones = _mm256_set1_epi32(1);
505  twos = _mm256_set1_epi32(2);
506  fours = _mm256_set1_epi32(4);
507 
508  cp1 = _mm256_set1_ps(1.0);
509  cp2 = _mm256_set1_ps(0.83333333e-1);
510  cp3 = _mm256_set1_ps(0.2777778e-2);
511  cp4 = _mm256_set1_ps(0.49603e-4);
512  cp5 = _mm256_set1_ps(0.551e-6);
513 
514  for (; number < eighthPoints; number++) {
515  aVal = _mm256_loadu_ps(aPtr);
516  s = _mm256_sub_ps(aVal,
517  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
518  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
519  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
520  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
521 
522  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
523  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
524 
525  s = _mm256_div_ps(
526  s,
527  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
528  s = _mm256_mul_ps(s, s);
529  // Evaluate Taylor series
530  s = _mm256_mul_ps(
531  _mm256_add_ps(
532  _mm256_mul_ps(
533  _mm256_sub_ps(
534  _mm256_mul_ps(
535  _mm256_add_ps(
536  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
537  s),
538  cp3),
539  s),
540  cp2),
541  s),
542  cp1),
543  s);
544 
545  for (i = 0; i < 3; i++) {
546  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
547  }
548  s = _mm256_div_ps(s, ftwos);
549 
550  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
551  cosine = _mm256_sub_ps(fones, s);
552 
553  condition1 = _mm256_cmp_ps(
554  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
555  fzeroes,
556  _CMP_NEQ_UQ);
557  condition2 = _mm256_cmp_ps(
558  _mm256_cmp_ps(
559  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
560  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
561  _CMP_NEQ_UQ);
562  // Need this condition only for cos
563  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
564  // twos), fours)), fzeroes);
565 
566  sine =
567  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
568  sine = _mm256_sub_ps(
569  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
570  _mm256_storeu_ps(bPtr, sine);
571  aPtr += 8;
572  bPtr += 8;
573  }
574 
575  number = eighthPoints * 8;
576  for (; number < num_points; number++) {
577  *bPtr++ = sin(*aPtr++);
578  }
579 }
580 
581 #endif /* LV_HAVE_AVX2 for unaligned */
582 
583 
584 #ifdef LV_HAVE_SSE4_1
585 #include <smmintrin.h>
586 
587 static inline void
588 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
589 {
590  float* bPtr = bVector;
591  const float* aPtr = aVector;
592 
593  unsigned int number = 0;
594  unsigned int quarterPoints = num_points / 4;
595  unsigned int i = 0;
596 
597  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
598  fzeroes;
599  __m128 sine, cosine, condition1, condition2;
600  __m128i q, r, ones, twos, fours;
601 
602  m4pi = _mm_set1_ps(1.273239545);
603  pio4A = _mm_set1_ps(0.78515625);
604  pio4B = _mm_set1_ps(0.241876e-3);
605  ffours = _mm_set1_ps(4.0);
606  ftwos = _mm_set1_ps(2.0);
607  fones = _mm_set1_ps(1.0);
608  fzeroes = _mm_setzero_ps();
609  ones = _mm_set1_epi32(1);
610  twos = _mm_set1_epi32(2);
611  fours = _mm_set1_epi32(4);
612 
613  cp1 = _mm_set1_ps(1.0);
614  cp2 = _mm_set1_ps(0.83333333e-1);
615  cp3 = _mm_set1_ps(0.2777778e-2);
616  cp4 = _mm_set1_ps(0.49603e-4);
617  cp5 = _mm_set1_ps(0.551e-6);
618 
619  for (; number < quarterPoints; number++) {
620  aVal = _mm_loadu_ps(aPtr);
621  s = _mm_sub_ps(aVal,
622  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
623  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
624  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
625 
626  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
627  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
628 
629  s = _mm_div_ps(
630  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
631  s = _mm_mul_ps(s, s);
632  // Evaluate Taylor series
633  s = _mm_mul_ps(
634  _mm_add_ps(
635  _mm_mul_ps(
636  _mm_sub_ps(
637  _mm_mul_ps(
638  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
639  cp3),
640  s),
641  cp2),
642  s),
643  cp1),
644  s);
645 
646  for (i = 0; i < 3; i++) {
647  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
648  }
649  s = _mm_div_ps(s, ftwos);
650 
651  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
652  cosine = _mm_sub_ps(fones, s);
653 
654  condition1 = _mm_cmpneq_ps(
655  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
656  condition2 = _mm_cmpneq_ps(
657  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
658  _mm_cmplt_ps(aVal, fzeroes));
659 
660  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
661  sine =
662  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
663  _mm_storeu_ps(bPtr, sine);
664  aPtr += 4;
665  bPtr += 4;
666  }
667 
668  number = quarterPoints * 4;
669  for (; number < num_points; number++) {
670  *bPtr++ = sinf(*aPtr++);
671  }
672 }
673 
674 #endif /* LV_HAVE_SSE4_1 for unaligned */
675 
676 
677 #ifdef LV_HAVE_GENERIC
678 
679 static inline void
680 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
681 {
682  float* bPtr = bVector;
683  const float* aPtr = aVector;
684  unsigned int number = 0;
685 
686  for (number = 0; number < num_points; number++) {
687  *bPtr++ = sinf(*aPtr++);
688  }
689 }
690 
691 #endif /* LV_HAVE_GENERIC */
692 
693 
694 #ifdef LV_HAVE_NEON
695 #include <arm_neon.h>
697 
698 static inline void
699 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
700 {
701  unsigned int number = 0;
702  unsigned int quarter_points = num_points / 4;
703  float* bVectorPtr = bVector;
704  const float* aVectorPtr = aVector;
705 
706  float32x4_t b_vec;
707  float32x4_t a_vec;
708 
709  for (number = 0; number < quarter_points; number++) {
710  a_vec = vld1q_f32(aVectorPtr);
711  // Prefetch next one, speeds things up
712  __VOLK_PREFETCH(aVectorPtr + 4);
713  b_vec = _vsinq_f32(a_vec);
714  vst1q_f32(bVectorPtr, b_vec);
715  // move pointers ahead
716  bVectorPtr += 4;
717  aVectorPtr += 4;
718  }
719 
720  // Deal with the rest
721  for (number = quarter_points * 4; number < num_points; number++) {
722  *bVectorPtr++ = sinf(*aVectorPtr++);
723  }
724 }
725 
726 #endif /* LV_HAVE_NEON */
727 
728 
729 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:263
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:680
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:699