Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
55 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
56 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
57 
58 #include <inttypes.h>
59 #include <stdio.h>
60 
61 #ifdef LV_HAVE_SSE2
62 
63 #include <emmintrin.h>
64 
/*
 * SSE2 implementation: for each element i, computes
 *   target[i] = max*(max*(src0[i], src1[i]), max*(src2[i], src3[i]))
 * where "max*" selects between two shorts by the sign of their wrapping
 * 16-bit difference (psubw/pcmpgtw), NOT by a full-width compare.  This
 * matches the generic kernel's `(short)(a - b) > 0` comparison, so metrics
 * that have wrapped around short range still compare consistently.
 * Processes 8 shorts per iteration; the scalar tail loop handles the rest.
 * All pointers are assumed 16-byte aligned (the `_a` variant uses aligned
 * loads/stores).
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    /* bound: number of full 16-byte (8-short) chunks; leftovers: num_points % 8. */
    int bound = (num_bytes >> 4);
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7;

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0); /* src0 */
        xmm2 = _mm_load_si128(p_src1); /* src1 */
        xmm3 = _mm_load_si128(p_src2); /* src2 */
        xmm4 = _mm_load_si128(p_src3); /* src3 */

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1; /* save src0; xmm1 is clobbered by the subtract below */
        xmm8 = xmm3; /* save src2; xmm3 is clobbered by the subtract below */

        /* Wrapping differences: src1 - src0 and src3 - src2. */
        xmm1 = _mm_sub_epi16(xmm2, xmm1);

        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        /* All-ones mask per lane where src1 > src0 (resp. src3 > src2). */
        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        /* Branchless select: (mask & winner) + (~mask & other). */
        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5); /* max*(src0, src1) */
        xmm6 = _mm_add_epi16(xmm4, xmm6); /* max*(src2, src3) */

        /* Second round: pick between the two pairwise winners. */
        xmm1 = _mm_xor_si128(xmm1, xmm1); /* zero */
        xmm2 = xmm5;                      /* save max01 before it is clobbered */
        xmm5 = _mm_sub_epi16(xmm6, xmm5); /* max23 - max01 (wrapping) */
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); /* mask where max23 > max01 */
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);

        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1); /* final per-lane winner */
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }


    /* Historical inline-asm version of the loop above; kept disabled. */
    /*__VOLK_ASM __VOLK_VOLATILE
    (
    "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
    "cmp $0, %[bound]\n\t"
    "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"

    "movaps (%[src0]), %%xmm1\n\t"
    "movaps (%[src1]), %%xmm2\n\t"
    "movaps (%[src2]), %%xmm3\n\t"
    "movaps (%[src3]), %%xmm4\n\t"

    "pxor %%xmm5, %%xmm5\n\t"
    "pxor %%xmm6, %%xmm6\n\t"
    "movaps %%xmm1, %%xmm7\n\t"
    "movaps %%xmm3, %%xmm8\n\t"
    "psubw %%xmm2, %%xmm1\n\t"
    "psubw %%xmm4, %%xmm3\n\t"

    "pcmpgtw %%xmm1, %%xmm5\n\t"
    "pcmpgtw %%xmm3, %%xmm6\n\t"

    "pand %%xmm5, %%xmm2\n\t"
    "pand %%xmm6, %%xmm4\n\t"
    "pandn %%xmm7, %%xmm5\n\t"
    "pandn %%xmm8, %%xmm6\n\t"

    "paddw %%xmm2, %%xmm5\n\t"
    "paddw %%xmm4, %%xmm6\n\t"

    "pxor %%xmm1, %%xmm1\n\t"
    "movaps %%xmm5, %%xmm2\n\t"

    "psubw %%xmm6, %%xmm5\n\t"
    "add $16, %[src0]\n\t"
    "add $-1, %[bound]\n\t"

    "pcmpgtw %%xmm5, %%xmm1\n\t"
    "add $16, %[src1]\n\t"

    "pand %%xmm1, %%xmm6\n\t"

    "pandn %%xmm2, %%xmm1\n\t"
    "add $16, %[src2]\n\t"

    "paddw %%xmm6, %%xmm1\n\t"
    "add $16, %[src3]\n\t"

    "movaps %%xmm1, (%[target])\n\t"
    "addw $16, %[target]\n\t"
    "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"

    "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
    :
    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
    [src3]"r"(src3), [target]"r"(target)
    :
    );
    */

    /* Scalar tail: same wrapping-difference comparison as the vector loop.
     * bound * 8 is the first unprocessed element; leftovers == num_points % 8. */
    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}
205 
206 #endif /*LV_HAVE_SSE2*/
207 
208 #ifdef LV_HAVE_NEON
209 
210 #include <arm_neon.h>
211 
/*
 * NEON implementation: for each element i, computes
 *   target[i] = winner(winner(src0[i], src1[i]), winner(src2[i], src3[i]))
 * where winner(a, b) picks a when the wrapping 16-bit difference (a - b) is
 * >= 0, else b.  Selection is branchless: two complementary compare masks
 * (vcgeq / vcltq) gate the candidates via AND, and the gated lanes are
 * recombined with an add (exactly one mask is all-ones per lane, so the add
 * is a merge, never an arithmetic sum).
 * NOTE(review): on ties (difference == 0) this picks the first operand while
 * the generic kernel picks the second; the selected values are equal, so the
 * output is the same.
 * Processes 8 shorts per iteration; the scalar tail loop handles the rest.
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        /* Wrapping differences drive both pairwise selections. */
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        /* Complementary per-lane masks: comp0/comp2 where diff >= 0,
         * comp1/comp3 where diff < 0. */
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        /* Gate each candidate by its mask... */
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        /* ...and merge: per lane exactly one operand is nonzero-masked. */
        result1_vec = vaddq_s16(comp0, comp1); /* winner(src0, src1) */
        result2_vec = vaddq_s16(comp2, comp3); /* winner(src2, src3) */

        /* Second round: pick between the two pairwise winners. */
        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail for the final num_points % 8 elements; the pointers were
     * advanced past the vectorized region by the loop above. */
    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
273 #endif /* LV_HAVE_NEON */
274 
275 
276 #ifdef LV_HAVE_GENERIC
/*
 * Generic (portable) implementation: for each element i, computes
 *   target[i] = winner(winner(src0[i], src1[i]), winner(src2[i], src3[i]))
 * where winner(a, b) is a when the wrapping 16-bit difference
 * (short)(a - b) is > 0, else b.  The comparison deliberately truncates the
 * int difference back to short, so operands that have wrapped around short
 * range still compare consistently with the SIMD kernels, which use modular
 * 16-bit subtraction.  Do NOT replace it with a plain (a > b) compare.
 *
 * target     output buffer, at least num_points shorts
 * src0..src3 input buffers, at least num_points shorts each
 * num_points number of elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    /* The previous code computed num_bytes = num_points * 2 and then
     * bound = num_bytes >> 1, an identity round-trip; iterate over
     * num_points directly with an unsigned index to match its type. */
    unsigned int i;
    for (i = 0; i < num_points; ++i) {
        const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
    }
}
298 
299 #endif /*LV_HAVE_GENERIC*/
300 
301 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_16i_x4_quad_max_star_16i_generic(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:277
static void volk_16i_x4_quad_max_star_16i_a_sse2(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:65
static void volk_16i_x4_quad_max_star_16i_neon(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:212