fhi_lib/lib/src/xran_bfp_utils.hpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
 * @brief xRAN BFP compression/decompression utility functions
  21  *
  22  * @file xran_bfp_utils.hpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #pragma once
  28 #include <immintrin.h>
  29
  30 namespace BlockFloatCompander
  31 {
  32   /// Calculate exponent based on 16 max abs values using leading zero count.
  33   inline __m512i
  34   maskUpperWord(const __m512i inData)
  35   {
  36     const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  37                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  38                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  39                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
  40     return _mm512_and_epi64(inData, k_upperWordMask);
  41   }
  42
  43
  44   /// Calculate exponent based on 16 max abs values using leading zero count.
  45   inline __m512i
  46   expLzCnt(const __m512i maxAbs, const __m512i totShiftBits)
  47   {
  48     /// Compute exponent
  49     const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
  50     return _mm512_subs_epu16(totShiftBits, lzCount);
  51   }
  52
  53
  54   /// Full horizontal max of 16b values
  55   inline int
  56   horizontalMax1x32(const __m512i maxAbsReg)
  57   {
  58     /// Swap each IQ pair in each lane (via 32b rotation) and compute max of
  59     /// each pair.
  60     const auto maxRot16 = _mm512_rol_epi32(maxAbsReg, BlockFloatCompander::k_numBitsIQ);
  61     const auto maxAbsIQ = _mm512_max_epi16(maxAbsReg, maxRot16);
  62     /// Convert to 32b by removing repeated values in maxAbs
  63     const auto maxAbs32 = maskUpperWord(maxAbsIQ);
  64     /// Return reduced max
  65     return _mm512_reduce_max_epi32(maxAbs32);
  66   }
  67
  68
  69   /// Perform horizontal max of 16 bit values across each lane
  70   inline __m512i
  71   horizontalMax4x16(const __m512i maxAbsIn)
  72   {
  73     /// Swap 64b in each lane and compute max
  74     const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
  75     auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbsIn);
  76     auto maxAbsHorz = _mm512_max_epi16(maxAbsIn, maxAbsPerm);
  77
  78     /// Swap each pair of 32b in each lane and compute max
  79     const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  80     maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
  81     maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  82
  83     /// Swap each IQ pair in each lane (via 32b rotation) and compute max
  84     maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
  85     return _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  86   }
  87
  88
  89   /// Perform U-plane input data re-ordering and vertical max abs of 16b values
  90   /// Works on 4 RB at a time
  91   inline __m512i
  92   maxAbsVertical4RB(const __m512i inA, const __m512i inB, const __m512i inC)
  93   {
  94     /// Re-order the next 4RB in input data into 3 registers
  95     /// Input SIMD vectors are:
  96     ///   [A A A A A A A A A A A A B B B B]
  97     ///   [B B B B B B B B C C C C C C C C]
  98     ///   [C C C C D D D D D D D D D D D D]
  99     /// Re-ordered SIMD vectors are:
 100     ///   [A A A A B B B B C C C C D D D D]
 101     ///   [A A A A B B B B C C C C D D D D]
 102     ///   [A A A A B B B B C C C C D D D D]
 103     constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src
 104     constexpr int k_shuff1 = 0x41;
 105     const auto z_w1 = _mm512_mask_shuffle_i64x2(inA, k_msk1, inB, inC, k_shuff1);
 106
 107     constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src
 108     constexpr int k_shuff2 = 0xB1;
 109     const auto z_w2 = _mm512_mask_shuffle_i64x2(inB, k_msk2, inA, inC, k_shuff2);
 110
 111     constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src
 112     constexpr int k_shuff3 = 0xBE;
 113     const auto z_w3 = _mm512_mask_shuffle_i64x2(inC, k_msk3, inA, inB, k_shuff3);
 114
 115     /// Perform max abs on these 3 registers
 116     const auto abs16_1 = _mm512_abs_epi16(z_w1);
 117     const auto abs16_2 = _mm512_abs_epi16(z_w2);
 118     const auto abs16_3 = _mm512_abs_epi16(z_w3);
 119     return _mm512_max_epi16(_mm512_max_epi16(abs16_1, abs16_2), abs16_3);
 120   }
 121
 122
 123   /// Selects first 32 bit value in each src lane and packs into laneNum of dest
 124   inline __m512i
 125   slidePermute(const __m512i src, const __m512i dest, const int laneNum)
 126   {
 127     const auto k_selectVals = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
 128       28, 24, 20, 16, 28, 24, 20, 16);
 129     constexpr uint16_t k_laneMsk[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
 130     return _mm512_mask_permutex2var_epi32(dest, k_laneMsk[laneNum], k_selectVals, src);
 131   }
 132 }