Vector Optimized Library of Kernels  2.5.2
Architecture-tuned implementations of math kernels
volk_avx_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: GPL-3.0-or-later
8  */
9 
10 /*
11  * This file is intended to hold helper functions built from AVX intrinsics.
12  * They should be used in VOLK kernels to avoid copy-pasta.
13  */
14 
15 #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17 #include <immintrin.h>
18 
/*
 * Multiply four packed complex floats element-wise: returns x * y.
 * Both registers hold interleaved (real, imag) pairs, e.g.
 * x = ar,ai,br,bi,... and y = cr,ci,dr,di,...
 */
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    const __m256 y_real = _mm256_moveldup_ps(y);            // cr,cr,dr,dr ...
    const __m256 y_imag = _mm256_movehdup_ps(y);            // ci,ci,di,di ...
    const __m256 x_swapped = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br ...

    const __m256 straight = _mm256_mul_ps(x, y_real);      // ar*cr,ai*cr,br*dr,bi*dr ...
    const __m256 crossed = _mm256_mul_ps(x_swapped, y_imag); // ai*ci,ar*ci,bi*di,br*di ...

    // Subtract in the real slots, add in the imaginary slots:
    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
    return _mm256_addsub_ps(straight, crossed);
}
31 
/*
 * Complex-conjugate four packed complex floats stored as interleaved
 * (real, imag) pairs by flipping the sign bit of every odd-indexed float.
 */
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    // Sign bit set only in the odd (imaginary) slots; XOR negates those.
    const __m256 imag_sign_flip =
        _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, imag_sign_flip);
}
37 
/*
 * Multiply four packed complex floats in x by the complex conjugate of the
 * four in y: returns x * conj(y). Both registers hold interleaved
 * (real, imag) pairs.
 */
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    // Sign bit set only in the odd (imaginary) slots.
    const __m256 odd_sign = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);

    const __m256 y_re = _mm256_moveldup_ps(y); // cr,cr,dr,dr ...
    const __m256 y_im = _mm256_movehdup_ps(y); // ci,ci,di,di ...
    const __m256 y_im_signed = _mm256_xor_ps(y_im, odd_sign); // ci,-ci,di,-di ...
    const __m256 x_swapped = _mm256_permute_ps(x, 0xb1);      // ai,ar,bi,br ...

    const __m256 re_part = _mm256_mul_ps(x, y_re);                // ar*cr, ai*cr ...
    const __m256 im_part = _mm256_mul_ps(x_swapped, y_im_signed); // ai*ci, -ar*ci ...

    // ar*cr+ai*ci, ai*cr-ar*ci ...
    return _mm256_add_ps(re_part, im_part);
}
50 
/*
 * Divide each of four packed complex floats (interleaved real, imag) by its
 * own magnitude sqrt(re^2 + im^2). No special handling is applied when the
 * magnitude is zero.
 */
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    const __m256 squares = _mm256_mul_ps(val, val);
    // Per 128-bit lane: m0, m1, m0, m1 where mk = re_k^2 + im_k^2.
    const __m256 pair_sums = _mm256_hadd_ps(squares, squares);
    // Duplicate each sum so it lines up with both floats of its complex
    // value: m0, m0, m1, m1 (selector equals 0xD8).
    const __m256 aligned_sums =
        _mm256_shuffle_ps(pair_sums, pair_sums, _MM_SHUFFLE(3, 1, 2, 0));
    const __m256 magnitudes = _mm256_sqrt_ps(aligned_sums);
    return _mm256_div_ps(val, magnitudes);
}
59 
/*
 * Compute re^2 + im^2 for eight complex floats supplied as two registers of
 * four interleaved (real, imag) pairs each. Results come back in input
 * order: the four from cplxValue1 first, then the four from cplxValue2.
 */
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    const __m256 squared2 = _mm256_mul_ps(cplxValue2, cplxValue2);

    // Regroup the 128-bit halves so that the lane-wise horizontal add below
    // emits the sums in natural order.
    const __m256 low_halves = _mm256_permute2f128_ps(squared1, squared2, 0x20);
    const __m256 high_halves = _mm256_permute2f128_ps(squared1, squared2, 0x31);

    // Add each adjacent I^2 / Q^2 pair.
    return _mm256_hadd_ps(low_halves, high_halves);
}
69 
70 static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
71 {
72  return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
73 }
74 
75 static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
76  const __m256 symbols1,
77  const __m256 points0,
78  const __m256 points1,
79  const __m256 scalar)
80 {
81  /*
82  * Calculate: |y - x|^2 * SNR_lin
83  * Consider 'symbolsX' and 'pointsX' to be complex float
84  * 'symbolsX' are 'y' and 'pointsX' are 'x'
85  */
86  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
87  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
88  const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
89  return _mm256_mul_ps(norms, scalar);
90 }
91 
/*
 * Build a float sign mask from the first eight bytes of 'fbits'.
 *
 * Output float k (k = 0..7) has only its sign bit set (bit pattern
 * 0x80000000) when byte k of 'fbits' is greater than zero as a signed
 * 8-bit value; otherwise it is all zero bits. Bytes 8..15 are ignored.
 */
static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    // Byte-shuffle controls, written as little-endian 32-bit words: the low
    // three bytes of each word are 0xff (shuffle yields zero) and the top
    // byte selects source byte k, moving it into the float's sign byte.
    const __m128i pick_bytes_0_to_3 =
        _mm_setr_epi32(0x00ffffff, 0x01ffffff, 0x02ffffff, 0x03ffffff);
    const __m128i pick_bytes_4_to_7 =
        _mm_setr_epi32(0x04ffffff, 0x05ffffff, 0x06ffffff, 0x07ffffff);

    // 0xff in every byte that is > 0, reduced to just the top bit (0x80).
    const __m128i positive = _mm_cmpgt_epi8(fbits, _mm_setzero_si128());
    const __m128i top_bits = _mm_and_si128(positive, _mm_set1_epi8(-128));

    const __m128 lower = _mm_castsi128_ps(_mm_shuffle_epi8(top_bits, pick_bytes_0_to_3));
    const __m128 upper = _mm_castsi128_ps(_mm_shuffle_epi8(top_bits, pick_bytes_4_to_7));

    // _mm256_set_m128(upper, lower) would express this directly, but it
    // seems to be missing in GCC, so the halves are assembled by hand.
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lower), upper, 0x1);
}
143 
/*
 * Deinterleave the 16 floats formed by src0 followed by src1.
 *
 * On return *llr0 holds the even-indexed floats (0, 2, ..., 14) and *llr1
 * holds the odd-indexed floats (1, 3, ..., 15), both in ascending order.
 */
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // Gather matching 128-bit halves first so that the lane-wise shuffles
    // below produce the results in ascending order.
    const __m256 low_halves = _mm256_permute2f128_ps(src0, src1, 0x20);
    const __m256 high_halves = _mm256_permute2f128_ps(src0, src1, 0x31);

    *llr0 = _mm256_shuffle_ps(low_halves, high_halves, _MM_SHUFFLE(2, 0, 2, 0)); // 0x88
    *llr1 = _mm256_shuffle_ps(low_halves, high_halves, _MM_SHUFFLE(3, 1, 3, 1)); // 0xdd
}
153 
/*
 * Min-sum combination of adjacent value pairs.
 *
 * The 16 floats in src0, src1 are split into even- and odd-indexed values
 * via _mm256_polar_deinterleave(). Each output takes its sign bit from the
 * XOR of the pair's sign bits and its magnitude from the smaller of the two
 * absolute values.
 */
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);
    // Every bit except the sign bit.
    const __m256 magnitude_bits = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));

    __m256 even, odd;
    _mm256_polar_deinterleave(&even, &odd, src0, src1);

    const __m256 smaller_abs = _mm256_min_ps(_mm256_and_ps(even, magnitude_bits),
                                             _mm256_and_ps(odd, magnitude_bits));
    const __m256 combined_sign = _mm256_and_ps(_mm256_xor_ps(even, odd), sign_bit);
    return _mm256_or_ps(smaller_abs, combined_sign);
}
170 
171 static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
172 {
173  // prepare sign mask for correct +-
174  __m256 sign_mask = _mm256_polar_sign_mask(fbits);
175 
176  __m256 llr0, llr1;
177  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
178 
179  // calculate result
180  llr0 = _mm256_xor_ps(llr0, sign_mask);
181  __m256 dst = _mm256_add_ps(llr0, llr1);
182  return dst;
183 }
184 
/*
 * Element-wise update of a running squared-sum accumulator:
 *
 *     sq_acc + rec * (aux * val - acc)^2
 *
 * The product is evaluated as ((d * d) * rec) with d = aux * val - acc.
 */
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    const __m256 deviation = _mm256_sub_ps(_mm256_mul_ps(aux, val), acc);
    const __m256 deviation_sq = _mm256_mul_ps(deviation, deviation);
    const __m256 weighted = _mm256_mul_ps(deviation_sq, rec);
    return _mm256_add_ps(sq_acc, weighted);
}
194 
195 #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */