/usr/include/volk/volk_neon_intrinsics.h is in libvolk1-dev 1.2.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
/*
* This file is intended to hold helper routines composed of NEON intrinsics
* ("intrinsics of intrinsics"). They should be used in VOLK kernels to avoid
* copy-pasting the same intrinsic sequences.
*/
#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
#include <arm_neon.h>
/**
 * Squared magnitude of four complex floats held in de-interleaved form.
 *
 * @param cplxValue  Complex vector pair: val[0] carries the four I (real)
 *                   components, val[1] the four Q (imaginary) components.
 * @return           Per-lane I*I + Q*Q.
 */
static inline float32x4_t
_vmagnitudesquaredq_f32(float32x4x2_t cplxValue)
{
    float32x4_t iValue, qValue, result;
    /* The body must use the declared parameter name; the previous spelling
     * referred to an identifier that does not exist in this scope. */
    iValue = vmulq_f32(cplxValue.val[0], cplxValue.val[0]); // I^2
    qValue = vmulq_f32(cplxValue.val[1], cplxValue.val[1]); // Q^2
    result = vaddq_f32(iValue, qValue);                     // I^2 + Q^2
    return result;
}
/**
 * Multiply four pairs of complex floats held in de-interleaved form.
 *
 * For each lane computes
 *   real = a.r * b.r - a.i * b.i
 *   imag = a.r * b.i + a.i * b.r
 *
 * @param a_val  First operand: val[0] = real parts, val[1] = imaginary parts.
 * @param b_val  Second operand, same layout as a_val.
 * @return       Product, same layout as the inputs.
 */
static inline float32x4x2_t
_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
    /* Scratch registers for the four partial products and the result. */
    float32x4x2_t tmp_real;
    float32x4x2_t tmp_imag;
    float32x4x2_t c_val;

    // multiply the real*real and imag*imag to get real result
    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

    // Multiply cross terms to get the imaginary result
    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    // combine the products
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    return c_val;
}
/**
 * Approximate log2() of four floats.
 *
 * The IEEE-754 single-precision bit pattern is split into its biased
 * exponent and 23-bit significand. The unbiased exponent supplies the
 * integer part of the result; a degree-6 minimax polynomial evaluated on
 * the significand (interpreted as a value in [1, 2)) supplies the rest.
 *
 * The sign bit is masked off and no special cases are handled: zero,
 * denormals, infinities and NaN are pushed through the same arithmetic
 * and do not yield the values log2f() would.
 *
 * @param aval  Four input floats.
 * @return      Per-lane approximation of log2(|aval|).
 */
static inline float32x4_t
_vlog2q_f32(float32x4_t aval)
{
    /* Constants are plain const automatics: an intrinsic call is not a
     * constant expression, so it cannot initialise a static object in C. */
    const int32x4_t one = vdupq_n_s32(0x000800000); /* implicit leading 1 (bit 23) */

    /* minimax polynomial */
    const float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);
    const float32x4_t p1 = vdupq_n_f32(6.1129631282966113);
    const float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);
    const float32x4_t p3 = vdupq_n_f32(3.2865287703753912);
    const float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);
    const float32x4_t p5 = vdupq_n_f32(0.2751487703421256);
    const float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);

    const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
    const int32x4_t sig_mask = vdupq_n_s32(0x007fffff);
    const int32x4_t exp_bias = vdupq_n_s32(127);

    /* Reinterpret the float lanes as raw bits explicitly rather than relying
     * on lax vector conversions when handing them to integer intrinsics. */
    const int32x4_t aval_bits = vreinterpretq_s32_f32(aval);

    /* extract the exponent and significand
       we can treat this as fixed point to save ~9% on the
       conversion + float add */
    int32x4_t exponent_i = vandq_s32(aval_bits, exp_mask);
    int32x4_t significand_i = vandq_s32(aval_bits, sig_mask);
    exponent_i = vshrq_n_s32(exponent_i, 23);

    /* restore the implicit 1 and convert Q23 fixed point to float in [1,2) */
    significand_i = vorrq_s32(one, significand_i);
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);

    /* debias the exponent and convert to float */
    exponent_i = vsubq_s32(exponent_i, exp_bias);
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);

    /* put the significand through a polynomial fit of log2(x) [1,2]
       add the result to the exponent */
    float32x4_t log2_approx = vaddq_f32(exponent_f, p0);         /* p0 */
    float32x4_t tmp1 = vmulq_f32(significand_f, p1);             /* p1 * x */
    log2_approx = vaddq_f32(log2_approx, tmp1);
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */
    tmp1 = vmulq_f32(sig_2, p2);                                 /* p2 * x^2 */
    log2_approx = vaddq_f32(log2_approx, tmp1);

    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f);         /* x^3 */
    tmp1 = vmulq_f32(sig_3, p3);                                 /* p3 * x^3 */
    log2_approx = vaddq_f32(log2_approx, tmp1);
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2);                 /* x^4 */
    tmp1 = vmulq_f32(sig_4, p4);                                 /* p4 * x^4 */
    log2_approx = vaddq_f32(log2_approx, tmp1);
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2);                 /* x^5 */
    tmp1 = vmulq_f32(sig_5, p5);                                 /* p5 * x^5 */
    log2_approx = vaddq_f32(log2_approx, tmp1);
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3);                 /* x^6 */
    tmp1 = vmulq_f32(sig_6, p6);                                 /* p6 * x^6 */
    log2_approx = vaddq_f32(log2_approx, tmp1);

    return log2_approx;
}
#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
|