Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
60 
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 
/*
 * Portable reference kernel: multiplies `input` and `taps` element by element,
 * accumulates the products in single precision and writes the sum, converted
 * to a 16-bit integer, to `*result`.
 *
 * The fractional part of the sum is discarded (truncation toward zero).
 * Converting a float that does not fit into int16_t is undefined behavior in
 * C, so sums outside the int16_t range saturate to -32768 / 32767 and a NaN
 * sum is reported as 0.
 *
 * result     - receives the single converted dot product
 * input      - first operand, at least num_points floats
 * taps       - second operand, at least num_points floats
 * num_points - number of element pairs to process; 0 yields a result of 0
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float dotProduct = 0.0f;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        dotProduct += input[number] * taps[number];
    }

    if (dotProduct >= 32767.0f) {
        *result = 32767; /* positive overflow: saturate */
    } else if (dotProduct <= -32768.0f) {
        *result = -32768; /* negative overflow: saturate */
    } else if (dotProduct == dotProduct) {
        *result = (int16_t)dotProduct; /* in range: plain truncation */
    } else {
        *result = 0; /* NaN compares unequal to itself */
    }
}
85 
86 #endif /*LV_HAVE_GENERIC*/
87 
88 
89 #ifdef LV_HAVE_SSE
90 
91 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
92  const float* input,
93  const float* taps,
94  unsigned int num_points)
95 {
96 
97  unsigned int number = 0;
98  const unsigned int sixteenthPoints = num_points / 16;
99 
100  float dotProduct = 0;
101  const float* aPtr = input;
102  const float* bPtr = taps;
103 
104  __m128 a0Val, a1Val, a2Val, a3Val;
105  __m128 b0Val, b1Val, b2Val, b3Val;
106  __m128 c0Val, c1Val, c2Val, c3Val;
107 
108  __m128 dotProdVal0 = _mm_setzero_ps();
109  __m128 dotProdVal1 = _mm_setzero_ps();
110  __m128 dotProdVal2 = _mm_setzero_ps();
111  __m128 dotProdVal3 = _mm_setzero_ps();
112 
113  for (; number < sixteenthPoints; number++) {
114 
115  a0Val = _mm_load_ps(aPtr);
116  a1Val = _mm_load_ps(aPtr + 4);
117  a2Val = _mm_load_ps(aPtr + 8);
118  a3Val = _mm_load_ps(aPtr + 12);
119  b0Val = _mm_load_ps(bPtr);
120  b1Val = _mm_load_ps(bPtr + 4);
121  b2Val = _mm_load_ps(bPtr + 8);
122  b3Val = _mm_load_ps(bPtr + 12);
123 
124  c0Val = _mm_mul_ps(a0Val, b0Val);
125  c1Val = _mm_mul_ps(a1Val, b1Val);
126  c2Val = _mm_mul_ps(a2Val, b2Val);
127  c3Val = _mm_mul_ps(a3Val, b3Val);
128 
129  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
130  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
131  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
132  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
133 
134  aPtr += 16;
135  bPtr += 16;
136  }
137 
138  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
139  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
140  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
141 
142  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
143 
144  _mm_store_ps(dotProductVector,
145  dotProdVal0); // Store the results back into the dot product vector
146 
147  dotProduct = dotProductVector[0];
148  dotProduct += dotProductVector[1];
149  dotProduct += dotProductVector[2];
150  dotProduct += dotProductVector[3];
151 
152  number = sixteenthPoints * 16;
153  for (; number < num_points; number++) {
154  dotProduct += ((*aPtr++) * (*bPtr++));
155  }
156 
157  *result = (short)dotProduct;
158 }
159 
160 #endif /*LV_HAVE_SSE*/
161 
162 
163 #if LV_HAVE_AVX2 && LV_HAVE_FMA
164 
165 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
166  const float* input,
167  const float* taps,
168  unsigned int num_points)
169 {
170 
171  unsigned int number = 0;
172  const unsigned int thirtysecondPoints = num_points / 32;
173 
174  float dotProduct = 0;
175  const float* aPtr = input;
176  const float* bPtr = taps;
177 
178  __m256 a0Val, a1Val, a2Val, a3Val;
179  __m256 b0Val, b1Val, b2Val, b3Val;
180 
181  __m256 dotProdVal0 = _mm256_setzero_ps();
182  __m256 dotProdVal1 = _mm256_setzero_ps();
183  __m256 dotProdVal2 = _mm256_setzero_ps();
184  __m256 dotProdVal3 = _mm256_setzero_ps();
185 
186  for (; number < thirtysecondPoints; number++) {
187 
188  a0Val = _mm256_load_ps(aPtr);
189  a1Val = _mm256_load_ps(aPtr + 8);
190  a2Val = _mm256_load_ps(aPtr + 16);
191  a3Val = _mm256_load_ps(aPtr + 24);
192  b0Val = _mm256_load_ps(bPtr);
193  b1Val = _mm256_load_ps(bPtr + 8);
194  b2Val = _mm256_load_ps(bPtr + 16);
195  b3Val = _mm256_load_ps(bPtr + 24);
196 
197  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
198  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
199  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
200  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
201 
202  aPtr += 32;
203  bPtr += 32;
204  }
205 
206  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
207  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
208  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
209 
210  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
211 
212  _mm256_store_ps(dotProductVector,
213  dotProdVal0); // Store the results back into the dot product vector
214 
215  dotProduct = dotProductVector[0];
216  dotProduct += dotProductVector[1];
217  dotProduct += dotProductVector[2];
218  dotProduct += dotProductVector[3];
219  dotProduct += dotProductVector[4];
220  dotProduct += dotProductVector[5];
221  dotProduct += dotProductVector[6];
222  dotProduct += dotProductVector[7];
223 
224  number = thirtysecondPoints * 32;
225  for (; number < num_points; number++) {
226  dotProduct += ((*aPtr++) * (*bPtr++));
227  }
228 
229  *result = (short)dotProduct;
230 }
231 
232 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
233 
234 
235 #ifdef LV_HAVE_AVX
236 
237 static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
238  const float* input,
239  const float* taps,
240  unsigned int num_points)
241 {
242 
243  unsigned int number = 0;
244  const unsigned int thirtysecondPoints = num_points / 32;
245 
246  float dotProduct = 0;
247  const float* aPtr = input;
248  const float* bPtr = taps;
249 
250  __m256 a0Val, a1Val, a2Val, a3Val;
251  __m256 b0Val, b1Val, b2Val, b3Val;
252  __m256 c0Val, c1Val, c2Val, c3Val;
253 
254  __m256 dotProdVal0 = _mm256_setzero_ps();
255  __m256 dotProdVal1 = _mm256_setzero_ps();
256  __m256 dotProdVal2 = _mm256_setzero_ps();
257  __m256 dotProdVal3 = _mm256_setzero_ps();
258 
259  for (; number < thirtysecondPoints; number++) {
260 
261  a0Val = _mm256_load_ps(aPtr);
262  a1Val = _mm256_load_ps(aPtr + 8);
263  a2Val = _mm256_load_ps(aPtr + 16);
264  a3Val = _mm256_load_ps(aPtr + 24);
265  b0Val = _mm256_load_ps(bPtr);
266  b1Val = _mm256_load_ps(bPtr + 8);
267  b2Val = _mm256_load_ps(bPtr + 16);
268  b3Val = _mm256_load_ps(bPtr + 24);
269 
270  c0Val = _mm256_mul_ps(a0Val, b0Val);
271  c1Val = _mm256_mul_ps(a1Val, b1Val);
272  c2Val = _mm256_mul_ps(a2Val, b2Val);
273  c3Val = _mm256_mul_ps(a3Val, b3Val);
274 
275  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
276  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
277  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
278  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
279 
280  aPtr += 32;
281  bPtr += 32;
282  }
283 
284  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
285  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
286  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
287 
288  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
289 
290  _mm256_store_ps(dotProductVector,
291  dotProdVal0); // Store the results back into the dot product vector
292 
293  dotProduct = dotProductVector[0];
294  dotProduct += dotProductVector[1];
295  dotProduct += dotProductVector[2];
296  dotProduct += dotProductVector[3];
297  dotProduct += dotProductVector[4];
298  dotProduct += dotProductVector[5];
299  dotProduct += dotProductVector[6];
300  dotProduct += dotProductVector[7];
301 
302  number = thirtysecondPoints * 32;
303  for (; number < num_points; number++) {
304  dotProduct += ((*aPtr++) * (*bPtr++));
305  }
306 
307  *result = (short)dotProduct;
308 }
309 
310 #endif /*LV_HAVE_AVX*/
311 
312 #ifdef LV_HAVE_AVX512F
313 
314 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
315  const float* input,
316  const float* taps,
317  unsigned int num_points)
318 {
319 
320  unsigned int number = 0;
321  const unsigned int sixtyfourthPoints = num_points / 64;
322 
323  float dotProduct = 0;
324  const float* aPtr = input;
325  const float* bPtr = taps;
326 
327  __m512 a0Val, a1Val, a2Val, a3Val;
328  __m512 b0Val, b1Val, b2Val, b3Val;
329 
330  __m512 dotProdVal0 = _mm512_setzero_ps();
331  __m512 dotProdVal1 = _mm512_setzero_ps();
332  __m512 dotProdVal2 = _mm512_setzero_ps();
333  __m512 dotProdVal3 = _mm512_setzero_ps();
334 
335  for (; number < sixtyfourthPoints; number++) {
336 
337  a0Val = _mm512_load_ps(aPtr);
338  a1Val = _mm512_load_ps(aPtr + 16);
339  a2Val = _mm512_load_ps(aPtr + 32);
340  a3Val = _mm512_load_ps(aPtr + 48);
341  b0Val = _mm512_load_ps(bPtr);
342  b1Val = _mm512_load_ps(bPtr + 16);
343  b2Val = _mm512_load_ps(bPtr + 32);
344  b3Val = _mm512_load_ps(bPtr + 48);
345 
346  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
347  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
348  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
349  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
350 
351  aPtr += 64;
352  bPtr += 64;
353  }
354 
355  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
356  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
357  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
358 
359  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
360 
361  _mm512_store_ps(dotProductVector,
362  dotProdVal0); // Store the results back into the dot product vector
363 
364  dotProduct = dotProductVector[0];
365  dotProduct += dotProductVector[1];
366  dotProduct += dotProductVector[2];
367  dotProduct += dotProductVector[3];
368  dotProduct += dotProductVector[4];
369  dotProduct += dotProductVector[5];
370  dotProduct += dotProductVector[6];
371  dotProduct += dotProductVector[7];
372  dotProduct += dotProductVector[8];
373  dotProduct += dotProductVector[9];
374  dotProduct += dotProductVector[10];
375  dotProduct += dotProductVector[11];
376  dotProduct += dotProductVector[12];
377  dotProduct += dotProductVector[13];
378  dotProduct += dotProductVector[14];
379  dotProduct += dotProductVector[15];
380 
381  number = sixtyfourthPoints * 64;
382  for (; number < num_points; number++) {
383  dotProduct += ((*aPtr++) * (*bPtr++));
384  }
385 
386  *result = (short)dotProduct;
387 }
388 
389 #endif /*LV_HAVE_AVX512F*/
390 
391 
392 #ifdef LV_HAVE_SSE
393 
394 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
395  const float* input,
396  const float* taps,
397  unsigned int num_points)
398 {
399 
400  unsigned int number = 0;
401  const unsigned int sixteenthPoints = num_points / 16;
402 
403  float dotProduct = 0;
404  const float* aPtr = input;
405  const float* bPtr = taps;
406 
407  __m128 a0Val, a1Val, a2Val, a3Val;
408  __m128 b0Val, b1Val, b2Val, b3Val;
409  __m128 c0Val, c1Val, c2Val, c3Val;
410 
411  __m128 dotProdVal0 = _mm_setzero_ps();
412  __m128 dotProdVal1 = _mm_setzero_ps();
413  __m128 dotProdVal2 = _mm_setzero_ps();
414  __m128 dotProdVal3 = _mm_setzero_ps();
415 
416  for (; number < sixteenthPoints; number++) {
417 
418  a0Val = _mm_loadu_ps(aPtr);
419  a1Val = _mm_loadu_ps(aPtr + 4);
420  a2Val = _mm_loadu_ps(aPtr + 8);
421  a3Val = _mm_loadu_ps(aPtr + 12);
422  b0Val = _mm_loadu_ps(bPtr);
423  b1Val = _mm_loadu_ps(bPtr + 4);
424  b2Val = _mm_loadu_ps(bPtr + 8);
425  b3Val = _mm_loadu_ps(bPtr + 12);
426 
427  c0Val = _mm_mul_ps(a0Val, b0Val);
428  c1Val = _mm_mul_ps(a1Val, b1Val);
429  c2Val = _mm_mul_ps(a2Val, b2Val);
430  c3Val = _mm_mul_ps(a3Val, b3Val);
431 
432  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
433  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
434  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
435  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
436 
437  aPtr += 16;
438  bPtr += 16;
439  }
440 
441  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
442  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
443  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
444 
445  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
446 
447  _mm_store_ps(dotProductVector,
448  dotProdVal0); // Store the results back into the dot product vector
449 
450  dotProduct = dotProductVector[0];
451  dotProduct += dotProductVector[1];
452  dotProduct += dotProductVector[2];
453  dotProduct += dotProductVector[3];
454 
455  number = sixteenthPoints * 16;
456  for (; number < num_points; number++) {
457  dotProduct += ((*aPtr++) * (*bPtr++));
458  }
459 
460  *result = (short)dotProduct;
461 }
462 
463 #endif /*LV_HAVE_SSE*/
464 
465 
466 #if LV_HAVE_AVX2 && LV_HAVE_FMA
467 
468 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
469  const float* input,
470  const float* taps,
471  unsigned int num_points)
472 {
473 
474  unsigned int number = 0;
475  const unsigned int thirtysecondPoints = num_points / 32;
476 
477  float dotProduct = 0;
478  const float* aPtr = input;
479  const float* bPtr = taps;
480 
481  __m256 a0Val, a1Val, a2Val, a3Val;
482  __m256 b0Val, b1Val, b2Val, b3Val;
483 
484  __m256 dotProdVal0 = _mm256_setzero_ps();
485  __m256 dotProdVal1 = _mm256_setzero_ps();
486  __m256 dotProdVal2 = _mm256_setzero_ps();
487  __m256 dotProdVal3 = _mm256_setzero_ps();
488 
489  for (; number < thirtysecondPoints; number++) {
490 
491  a0Val = _mm256_loadu_ps(aPtr);
492  a1Val = _mm256_loadu_ps(aPtr + 8);
493  a2Val = _mm256_loadu_ps(aPtr + 16);
494  a3Val = _mm256_loadu_ps(aPtr + 24);
495  b0Val = _mm256_loadu_ps(bPtr);
496  b1Val = _mm256_loadu_ps(bPtr + 8);
497  b2Val = _mm256_loadu_ps(bPtr + 16);
498  b3Val = _mm256_loadu_ps(bPtr + 24);
499 
500  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
501  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
502  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
503  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
504 
505  aPtr += 32;
506  bPtr += 32;
507  }
508 
509  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
510  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
511  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
512 
513  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
514 
515  _mm256_store_ps(dotProductVector,
516  dotProdVal0); // Store the results back into the dot product vector
517 
518  dotProduct = dotProductVector[0];
519  dotProduct += dotProductVector[1];
520  dotProduct += dotProductVector[2];
521  dotProduct += dotProductVector[3];
522  dotProduct += dotProductVector[4];
523  dotProduct += dotProductVector[5];
524  dotProduct += dotProductVector[6];
525  dotProduct += dotProductVector[7];
526 
527  number = thirtysecondPoints * 32;
528  for (; number < num_points; number++) {
529  dotProduct += ((*aPtr++) * (*bPtr++));
530  }
531 
532  *result = (short)dotProduct;
533 }
534 
535 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
536 
537 
538 #ifdef LV_HAVE_AVX
539 
540 static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
541  const float* input,
542  const float* taps,
543  unsigned int num_points)
544 {
545 
546  unsigned int number = 0;
547  const unsigned int thirtysecondPoints = num_points / 32;
548 
549  float dotProduct = 0;
550  const float* aPtr = input;
551  const float* bPtr = taps;
552 
553  __m256 a0Val, a1Val, a2Val, a3Val;
554  __m256 b0Val, b1Val, b2Val, b3Val;
555  __m256 c0Val, c1Val, c2Val, c3Val;
556 
557  __m256 dotProdVal0 = _mm256_setzero_ps();
558  __m256 dotProdVal1 = _mm256_setzero_ps();
559  __m256 dotProdVal2 = _mm256_setzero_ps();
560  __m256 dotProdVal3 = _mm256_setzero_ps();
561 
562  for (; number < thirtysecondPoints; number++) {
563 
564  a0Val = _mm256_loadu_ps(aPtr);
565  a1Val = _mm256_loadu_ps(aPtr + 8);
566  a2Val = _mm256_loadu_ps(aPtr + 16);
567  a3Val = _mm256_loadu_ps(aPtr + 24);
568  b0Val = _mm256_loadu_ps(bPtr);
569  b1Val = _mm256_loadu_ps(bPtr + 8);
570  b2Val = _mm256_loadu_ps(bPtr + 16);
571  b3Val = _mm256_loadu_ps(bPtr + 24);
572 
573  c0Val = _mm256_mul_ps(a0Val, b0Val);
574  c1Val = _mm256_mul_ps(a1Val, b1Val);
575  c2Val = _mm256_mul_ps(a2Val, b2Val);
576  c3Val = _mm256_mul_ps(a3Val, b3Val);
577 
578  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
579  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
580  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
581  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
582 
583  aPtr += 32;
584  bPtr += 32;
585  }
586 
587  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
588  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
589  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
590 
591  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
592 
593  _mm256_store_ps(dotProductVector,
594  dotProdVal0); // Store the results back into the dot product vector
595 
596  dotProduct = dotProductVector[0];
597  dotProduct += dotProductVector[1];
598  dotProduct += dotProductVector[2];
599  dotProduct += dotProductVector[3];
600  dotProduct += dotProductVector[4];
601  dotProduct += dotProductVector[5];
602  dotProduct += dotProductVector[6];
603  dotProduct += dotProductVector[7];
604 
605  number = thirtysecondPoints * 32;
606  for (; number < num_points; number++) {
607  dotProduct += ((*aPtr++) * (*bPtr++));
608  }
609 
610  *result = (short)dotProduct;
611 }
612 
613 #endif /*LV_HAVE_AVX*/
614 
615 #ifdef LV_HAVE_AVX512F
616 
617 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
618  const float* input,
619  const float* taps,
620  unsigned int num_points)
621 {
622 
623  unsigned int number = 0;
624  const unsigned int sixtyfourthPoints = num_points / 64;
625 
626  float dotProduct = 0;
627  const float* aPtr = input;
628  const float* bPtr = taps;
629 
630  __m512 a0Val, a1Val, a2Val, a3Val;
631  __m512 b0Val, b1Val, b2Val, b3Val;
632 
633  __m512 dotProdVal0 = _mm512_setzero_ps();
634  __m512 dotProdVal1 = _mm512_setzero_ps();
635  __m512 dotProdVal2 = _mm512_setzero_ps();
636  __m512 dotProdVal3 = _mm512_setzero_ps();
637 
638  for (; number < sixtyfourthPoints; number++) {
639 
640  a0Val = _mm512_loadu_ps(aPtr);
641  a1Val = _mm512_loadu_ps(aPtr + 16);
642  a2Val = _mm512_loadu_ps(aPtr + 32);
643  a3Val = _mm512_loadu_ps(aPtr + 48);
644  b0Val = _mm512_loadu_ps(bPtr);
645  b1Val = _mm512_loadu_ps(bPtr + 16);
646  b2Val = _mm512_loadu_ps(bPtr + 32);
647  b3Val = _mm512_loadu_ps(bPtr + 48);
648 
649  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
650  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
651  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
652  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
653 
654  aPtr += 64;
655  bPtr += 64;
656  }
657 
658  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
659  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
660  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
661 
662  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
663 
664  _mm512_storeu_ps(dotProductVector,
665  dotProdVal0); // Store the results back into the dot product vector
666 
667  dotProduct = dotProductVector[0];
668  dotProduct += dotProductVector[1];
669  dotProduct += dotProductVector[2];
670  dotProduct += dotProductVector[3];
671  dotProduct += dotProductVector[4];
672  dotProduct += dotProductVector[5];
673  dotProduct += dotProductVector[6];
674  dotProduct += dotProductVector[7];
675  dotProduct += dotProductVector[8];
676  dotProduct += dotProductVector[9];
677  dotProduct += dotProductVector[10];
678  dotProduct += dotProductVector[11];
679  dotProduct += dotProductVector[12];
680  dotProduct += dotProductVector[13];
681  dotProduct += dotProductVector[14];
682  dotProduct += dotProductVector[15];
683 
684  number = sixtyfourthPoints * 64;
685  for (; number < num_points; number++) {
686  dotProduct += ((*aPtr++) * (*bPtr++));
687  }
688 
689  *result = (short)dotProduct;
690 }
691 
692 #endif /*LV_HAVE_AVX512F*/
693 
694 
695 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:68
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:91
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:394
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:540
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:237