volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

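/* One decision bit per state for each decoded bit.  With NUMSTATES = 64 and a
 * one-bit decision type, the 64 decisions of a trellis stage pack into 8 bytes;
 * the union exposes the same storage as bytes, shorts, or words so the SIMD
 * kernels can write whole registers of decisions at once. */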
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


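/* Subtract the smallest path metric from all 64 state metrics so the unsigned
 * 8-bit metrics cannot wrap.  The threshold test is commented out below, so the
 * threshold argument is currently unused and the subtraction always runs. */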
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    //}
}


// helper BFLY for GENERIC version
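// One add-compare-select butterfly: the branch metric is the soft distance
// between the received symbol pair and the branch-table entry, candidate path
// metrics for the two successor states are formed from metric and (max - metric),
// the smaller survivor of each pair is written to Y, and the two decision bits
// are packed into d->w[].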
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

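/* AVX2 kernel.  Each loop iteration consumes four input symbols and performs two
 * trellis stages entirely in 256-bit registers: branch metrics are built with
 * XOR/avg/shift, the add-compare-select step uses saturating unsigned adds and
 * byte-wise minima, decision masks are written to dec, and the state metrics are
 * renormalized whenever the first metric exceeds 210.  The numbered variable
 * names come from machine-generated (SPIRAL-style) code. */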
static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        int s20, s21;
        unsigned char *a80, *b6;
        int *a110, *a91, *a93;
        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
        __m256i a86, a87;
        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
            s18, s19, s22, s23, s24, s25, t13, t14, t15;
        a71 = ((__m256i*)X);
        s18 = *(a71);
        a72 = (a71 + 1);
        s19 = *(a72);
        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
        s18 = s22;
        a73 = (4 * i9);
        b6 = (syms + a73);
        a75 = *(b6);
        a76 = _mm256_set1_epi8(a75);
        a77 = ((__m256i*)Branchtab);
        a78 = *(a77);
        a79 = _mm256_xor_si256(a76, a78);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm256_set1_epi8(a81);
        a83 = (a77 + 1);
        a84 = *(a83);
        a85 = _mm256_xor_si256(a82, a84);
        t13 = _mm256_avg_epu8(a79, a85);
        a86 = ((__m256i)t13);
        a87 = _mm256_srli_epi16(a86, 2);
        a88 = ((__m256i)a87);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        a91 = ((int*)dec);
        a92 = (4 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        a110 = (a93 + 1);
        *(a110) = s21;
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        a95 = ((__m256i*)Y);
        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
        *(a95) = s24;
        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
        a112 = (a95 + 1);
        *(a112) = s23;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m256i m5, m6;
            m5 = ((__m256i*)Y)[0];
            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
            __m256i m7;
            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                           ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                           ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                           ((__m256i)m7)));
            m7 = _mm256_unpacklo_epi8(m7, m7);
            m7 = _mm256_shufflelo_epi16(m7, 0);
            m6 = _mm256_unpacklo_epi64(m7, m7);
            m6 = _mm256_permute2x128_si256(
                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
                            // operate on 128 bit lanes
            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
        }
        unsigned char a188, a194;
        int a205;
        int s48, s54;
        unsigned char *a187, *a193;
        int *a204, *a206, *a223, *b16;
        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
        __m256i a199, a200;
        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
            m41, m42, s46, s47, s50, s51, t25, t26, t27;
        a184 = ((__m256i*)Y);
        s46 = *(a184);
        a185 = (a184 + 1);
        s47 = *(a185);
        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
        s46 = s50;
        a187 = (b6 + 2);
        a188 = *(a187);
        a189 = _mm256_set1_epi8(a188);
        a190 = ((__m256i*)Branchtab);
        a191 = *(a190);
        a192 = _mm256_xor_si256(a189, a191);
        a193 = (b6 + 3);
        a194 = *(a193);
        a195 = _mm256_set1_epi8(a194);
        a196 = (a190 + 1);
        a197 = *(a196);
        a198 = _mm256_xor_si256(a195, a197);
        t25 = _mm256_avg_epu8(a192, a198);
        a199 = ((__m256i)t25);
        a200 = _mm256_srli_epi16(a199, 2);
        a201 = ((__m256i)a200);
        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
        m39 = _mm256_adds_epu8(s46, t26);
        m40 = _mm256_adds_epu8(s47, t27);
        m41 = _mm256_adds_epu8(s46, t27);
        m42 = _mm256_adds_epu8(s47, t26);
        a202 = _mm256_min_epu8(m40, m39);
        d17 = _mm256_cmpeq_epi8(a202, m40);
        a203 = _mm256_min_epu8(m42, m41);
        d18 = _mm256_cmpeq_epi8(a203, m42);
        s24 = _mm256_unpacklo_epi8(d17, d18);
        s25 = _mm256_unpackhi_epi8(d17, d18);
        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
        a204 = ((int*)dec);
        a205 = (4 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 2);
        *(a206) = s48;
        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
        a223 = (b16 + 3);
        *(a223) = s54;
        s50 = _mm256_unpacklo_epi8(a202, a203);
        s51 = _mm256_unpackhi_epi8(a202, a203);
        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
        a208 = ((__m256i*)X);
        *(a208) = s25;
        a225 = (a208 + 1);
        *(a225) = s51;

        if ((((unsigned char*)X)[0] > 210)) {
            __m256i m12, m13;
            m12 = ((__m256i*)X)[0];
            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
            __m256i m14;
            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
                                            ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
                                            ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
                                            ((__m256i)m14)));
            m14 = _mm256_unpacklo_epi8(m14, m14);
            m14 = _mm256_shufflelo_epi16(m14, 0);
            m13 = _mm256_unpacklo_epi64(m14, m14);
            m13 = _mm256_permute2x128_si256(m13, m13, 0);
            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
        }
    }

    renormalize(X, 210);

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);
    }
    /*skip*/
}

#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

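/* SSE3 kernel (the _spiral suffix reflects its machine-generated, SPIRAL-style
 * origin).  The structure mirrors the AVX2 version but works in 128-bit halves,
 * so each of the two trellis stages per iteration touches four __m128i state
 * registers and writes four 16-bit decision masks. */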
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/


#if LV_HAVE_GENERIC

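/* Portable reference implementation: one butterfly pass and one renormalize per
 * decoded bit, swapping the X and Y metric buffers between bits. */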
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
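
For reference, the sketch below shows one way a caller might drive the generic kernel above; it is illustrative only, not VOLK API documentation. The buffer sizes follow from the code itself (64 path metrics, RATE * NUMSTATES/2 = 64 branch-table bytes, and 8 decision bytes per decoded bit), while the generator polynomials (0x4F, 0x6D), the example_parity() helper, and the example_run_kernel() wrapper are assumptions made for this illustration. In practice the kernel is normally reached through the volk_8u_x4_conv_k7_r2_8u dispatcher and followed by a traceback over dec, which is omitted here.

#include <string.h>
#include <volk/volk_8u_x4_conv_k7_r2_8u.h>

/* Illustrative helper (not part of VOLK): parity of a 32-bit word. */
static int example_parity(unsigned int x)
{
    x ^= x >> 16;
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return (int)(x & 1);
}

/* Sketch: run the generic kernel on nbits data bits given 2 * nbits soft symbols.
 * dec must provide nbits * 8 bytes for the packed decisions; traceback omitted. */
static void example_run_kernel(unsigned char* syms, unsigned int nbits, unsigned char* dec)
{
    /* Assumed K=7, rate-1/2 generator polynomials. */
    const unsigned int polys[2] = { 0x4F, 0x6D };
    unsigned char Branchtab[2 * 32]; /* RATE * NUMSTATES/2 expected branch symbols */
    unsigned char X[64], Y[64];      /* path metrics for the 64 states */
    unsigned int j, state;

    /* Expected encoder output per pair of merging states, in the 0/255 soft convention. */
    for (j = 0; j < 2; j++)
        for (state = 0; state < 32; state++)
            Branchtab[j * 32 + state] = example_parity((2 * state) & polys[j]) ? 255 : 0;

    memset(X, 0, sizeof(X)); /* equal starting metrics; a real decoder may bias state 0 */
    memset(Y, 0, sizeof(Y));
    memset(dec, 0, nbits * 8); /* BFLY ORs decision bits in, so dec must start cleared */

    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, nbits, 0, Branchtab);
}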