Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_index_max_32u.h
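volk_32fc_index_max_32u scans a vector of complex floats, computes the squared magnitude of every sample, and writes the index of the largest one to target. The sketch below shows one way to call it; it is a minimal example that assumes the generated volk_32fc_index_max_32u dispatcher and the volk_malloc/volk_get_alignment/volk_free helpers from the VOLK runtime, and it is not part of this header.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const uint32_t N = 1024;
    // Aligned buffers let the dispatcher pick an aligned (_a_) kernel
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
    uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t), volk_get_alignment());

    // Magnitude grows with the index, so the maximum lands at N - 1
    for (uint32_t i = 0; i < N; ++i) {
        in[i] = lv_cmake(0.01f * (float)i, -0.01f * (float)i);
    }

    volk_32fc_index_max_32u(max_index, in, N);
    printf("largest |x|^2 at index %u\n", *max_index);

    volk_free(in);
    volk_free(max_index);
    return 0;
}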
/* -*- c++ -*- */
/*
 * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit256 holderf;
    union bit256 holderi;
    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    union bit256 xmm5, xmm4;
    __m256 xmm1, xmm2, xmm3;
    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm256_setzero_si256();
    xmm4.int_vec = _mm256_setzero_si256();
    holderf.int_vec = _mm256_setzero_si256();
    holderi.int_vec = _mm256_setzero_si256();

    int bound = num_bytes >> 6; // number of full 8-sample blocks
    int i = 0;

    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); // indices of the current block
    xmm9 = _mm256_setzero_si256();                    // per-lane index of the running maximum
    xmm10 = _mm256_set1_epi32(8);
    xmm3 = _mm256_setzero_ps();                       // per-lane running maximum of |x|^2
    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);

    for (; i < bound; ++i) {
        xmm1 = _mm256_load_ps((float*)src0);
        xmm2 = _mm256_load_ps((float*)&src0[4]);

        src0 += 8;

        // |x|^2 = re*re + im*im for eight complex samples
        xmm1 = _mm256_mul_ps(xmm1, xmm1);
        xmm2 = _mm256_mul_ps(xmm2, xmm2);

        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
        // hadd interleaves the two 128-bit lanes; restore sample order
        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

        xmm3 = _mm256_max_ps(xmm1, xmm3);

        // xmm4: lanes where the old maximum survives; xmm5: lanes where the new value is the maximum
        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

        // Select the new index where the value won, keep the old index elsewhere
        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

        xmm9 = _mm256_add_epi32(xmm11, xmm12);

        xmm8 = _mm256_add_epi32(xmm8, xmm10);
    }

    _mm256_store_ps((float*)&(holderf.f), xmm3);
    _mm256_store_si256(&(holderi.int_vec), xmm9);

    // Reduce the eight per-lane maxima to a single result
    for (i = 0; i < 8; i++) {
        if (holderf.f[i] > max) {
            index = holderi.i[i];
            max = holderf.f[i];
        }
    }

    // Handle the remaining num_points % 8 samples
    for (i = bound * 8; i < num_points; i++, src0++) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_AVX2*/

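The AVX2 kernel above (and the SSE3 and unaligned AVX2 variants below) tracks the argmax without branches: after updating the per-lane maximum, a less-than mask keeps the old index, an equal mask pulls in the new one, and the two masked indices can be combined with an add because the masks are mutually exclusive. A scalar sketch of the same bookkeeping for a single lane (plain C, not part of the header; the helper name is made up for illustration):

#include <stdint.h>

static inline void
update_lane(float new_val, uint32_t new_idx, float* max, uint32_t* max_idx)
{
    const float old_max = *max;
    *max = (new_val > old_max) ? new_val : old_max;                /* _mm_max_ps  */
    const uint32_t keep_old = (new_val < *max) ? 0xFFFFFFFFu : 0u; /* cmplt mask  */
    const uint32_t take_new = (new_val == *max) ? 0xFFFFFFFFu : 0u;/* cmpeq mask  */
    *max_idx = (new_idx & take_new) + (*max_idx & keep_old);       /* and + add   */
}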
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of full 4-sample blocks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // indices of the current block
    xmm9 = _mm_setzero_si128();        // per-lane index of the running maximum
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();           // per-lane running maximum of |x|^2

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2); // |x|^2 of four complex samples

        xmm3 = _mm_max_ps(xmm1, xmm3);

        // xmm4: lanes where the old maximum survives; xmm5: lanes where the new value is the maximum
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // At least two samples remain: process one more pair
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        // Duplicate the next two indices into all four lanes
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // One final sample remains: compare it against lane 0 only
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        // Here the comparison is old max vs. updated max, so the mask roles are
        // reversed: cmplt selects the new index, cmpeq keeps the old one.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00); // broadcast the index of the final sample

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // Reduce the four per-lane maxima to a single result
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

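The SSE3 tail handling follows from num_bytes = 8 * num_points: num_bytes >> 5 counts the full 4-sample blocks, bit 4 says whether at least a pair of samples remains, and bit 3 says whether one final sample remains. A quick worked check (plain C, not part of the header):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t num_points = 7;             /* 7 complex samples = 4 + 2 + 1 */
    const uint32_t num_bytes = num_points * 8; /* 56 bytes = 0b111000           */

    assert((num_bytes >> 5) == 1);       /* one full block of 4 samples */
    assert(((num_bytes >> 4) & 1) == 1); /* a pair of samples remains   */
    assert(((num_bytes >> 3) & 1) == 1); /* plus one final sample       */
    return 0;
}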
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    uint32_t i = 0;

    // num_bytes >> 3 == num_points
    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

// Same algorithm as the aligned AVX2 kernel above, but with unaligned loads/stores.
static inline void
volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit256 holderf;
    union bit256 holderi;
    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    union bit256 xmm5, xmm4;
    __m256 xmm1, xmm2, xmm3;
    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm256_setzero_si256();
    xmm4.int_vec = _mm256_setzero_si256();
    holderf.int_vec = _mm256_setzero_si256();
    holderi.int_vec = _mm256_setzero_si256();

    int bound = num_bytes >> 6;
    int i = 0;

    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    xmm9 = _mm256_setzero_si256();
    xmm10 = _mm256_set1_epi32(8);
    xmm3 = _mm256_setzero_ps();
    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);

    for (; i < bound; ++i) {
        xmm1 = _mm256_loadu_ps((float*)src0);
        xmm2 = _mm256_loadu_ps((float*)&src0[4]);

        src0 += 8;

        xmm1 = _mm256_mul_ps(xmm1, xmm1);
        xmm2 = _mm256_mul_ps(xmm2, xmm2);

        xmm1 = _mm256_hadd_ps(xmm1, xmm2);
        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

        xmm3 = _mm256_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

        xmm9 = _mm256_add_epi32(xmm11, xmm12);

        xmm8 = _mm256_add_epi32(xmm8, xmm10);
    }

    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
    _mm256_storeu_si256(&(holderi.int_vec), xmm9);

    for (i = 0; i < 8; i++) {
        if (holderf.f[i] > max) {
            index = holderi.i[i];
            max = holderf.f[i];
        }
    }

    for (i = bound * 8; i < num_points; i++, src0++) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points) {
        // Start from the squared magnitude of the first sample
        float max = lv_creal(*src0Ptr) * lv_creal(*src0Ptr) +
                    lv_cimag(*src0Ptr) * lv_cimag(*src0Ptr);
        uint32_t index = 0;

        float32x4_t vec_max = vdupq_n_f32(max);

        for (; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
            // Keep the pointer increment outside the prefetch macro
            src0Ptr += 4;
            __VOLK_PREFETCH(src0Ptr);
            // a > b?
            const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
            vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
            vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_max_indices[4];
        float tmp_max[4];
        vst1q_u32(tmp_max_indices, vec_max_indices);
        vst1q_f32(tmp_max, vec_max);

        // Reduce the four per-lane maxima to a single result
        for (int i = 0; i < 4; i++) {
            if (tmp_max[i] > max) {
                max = tmp_max[i];
                index = tmp_max_indices[i];
            }
        }

        // Deal with the rest
        for (number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*src0Ptr);
            const float im = lv_cimag(*src0Ptr);
            if ((re * re + im * im) > max) {
                max = re * re + im * im;
                index = number;
            }
            src0Ptr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/