fhi_lib/lib/src/xran_bfp_uplane_9b16rb.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression U-plane implementation and interface functions
  21  *
  22  * @file xran_bfp_uplane_9b16rb.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include "xran_compression.h"
  31 #include <complex>
  32 #include <algorithm>
  33 #include <immintrin.h>
  34 #include <limits.h>
  35 #include <cstring>
  36
  37 namespace BFP_UPlane_9b16RB
  38 {
  /// Namespace constants
  /// Number of real-valued samples in one resource block: 12 IQ pairs, I and Q counted separately.
  constexpr int k_numREReal = 24;
  41
  42
  43   /// Compute exponent value for a set of 16 RB from the maximum absolute value.
  44   /// Max Abs operates in a loop, executing 4 RB per iteration. The results are
  45   /// packed into the final output register.
  46   inline __m512i
  47   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  48   {
  49     __m512i maxAbs = __m512i();
  50     const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  51     /// Max Abs loop operates on 4RB at a time
  52 #pragma unroll(4)
  53     for (int n = 0; n < 4; ++n)
  54     {
  55       /// Re-order and vertical max abs
  56       auto maxAbsVert = BlockFloatCompander::maxAbsVertical4RB(rawData[3 * n + 0], rawData[3 * n + 1], rawData[3 * n + 2]);
  57       /// Horizontal max abs
  58       auto maxAbsHorz = BlockFloatCompander::horizontalMax4x16(maxAbsVert);
  59       /// Pack these 4 values into maxAbs
  60       maxAbs = BlockFloatCompander::slidePermute(maxAbsHorz, maxAbs, n);
  61     }
  62     /// Calculate exponent
  63     const auto maxAbs32 = BlockFloatCompander::maskUpperWord(maxAbs);
  64     return BlockFloatCompander::expLzCnt(maxAbs32, totShiftBits);
  65   }
  66
  67
  68   /// Apply compression to 1 RB
  69   template<BlockFloatCompander::PackFunction networkBytePack>
  70   inline void
  71   applyCompressionN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
  72                         const int numREOffset, const uint8_t thisExp, const int thisRBExpAddr, const uint16_t rbWriteMask)
  73   {
  74     /// Get AVX512 pointer aligned to desired RB
  75     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + numREOffset);
  76     /// Apply the exponent shift
  77     const auto compData = _mm512_srai_epi16(*rawDataIn, thisExp);
  78     /// Pack compressed data network byte order
  79     const auto compDataBytePacked = networkBytePack(compData);
  80     /// Store exponent first
  81     dataOut->dataCompressed[thisRBExpAddr] = thisExp;
  82     /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
  83     /// Use three offset stores to join
  84     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
  85     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + dataIn.iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
  86     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * dataIn.iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
  87   }
  88
  89
  90   /// Calls compression function specific to the number of RB to be executed. For 9, 10, or 12bit iqWidth.
  91   template<BlockFloatCompander::PackFunction networkBytePack>
  92   inline void
  93   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
  94                    const __m512i totShiftBits, const int totNumBytesPerRB, const uint16_t rbWriteMask)
  95   {
  96     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
  97 #pragma unroll(16)
  98     for (int n = 0; n < 16; ++n)
  99     {
 100       applyCompressionN_1RB<networkBytePack>(dataIn, dataOut, n * k_numREReal, ((uint8_t*)&exponents)[n * 4], n * totNumBytesPerRB, rbWriteMask);
 101     }
 102   }
 103
 104
 105   /// Apply compression to 1 RB
 106   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 107   inline void
 108   applyExpansionN_1RB(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 109                       const int expAddr, const int thisRBAddr, const int maxExpShift)
 110   {
 111     /// Unpack network order packed data
 112     const auto dataUnpacked = networkByteUnpack(dataIn.dataCompressed + expAddr + 1);
 113     /// Apply exponent scaling (by appropriate arithmetic shift right)
 114     const auto dataExpanded = _mm512_srai_epi16(dataUnpacked, maxExpShift - *(dataIn.dataCompressed + expAddr));
 115     /// Write expanded data to output
 116     static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
 117     _mm512_mask_storeu_epi16(dataOut->dataExpanded + thisRBAddr, k_WriteMask, dataExpanded);
 118   }
 119
 120
 121   /// Calls compression function specific to the number of RB to be executed. For 9, 10, or 12bit iqWidth.
 122   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 123   inline void
 124   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 125                  const int totNumBytesPerRB, const int maxExpShift)
 126   {
 127 #pragma unroll(16)
 128       for (int n = 0; n < 16; ++n)
 129       {
 130         applyExpansionN_1RB<networkByteUnpack>(dataIn, dataOut, n * totNumBytesPerRB, n * k_numREReal, maxExpShift);
 131       }
 132   }
 133 }
 134
 135
 136
 137 /// Main kernel function for compression.
 138 /// Starts by determining iqWidth specific parameters and functions.
 139 void
 140 BlockFloatCompander::BFPCompressUserPlaneAvx512_9b16RB(const ExpandedData& dataIn, CompressedData* dataOut)
 141 {
 142   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 143   const auto totShiftBits9 = _mm512_set1_epi32(24);
 144
 145   /// Total number of compressed bytes per RB for each iqWidth option
 146   constexpr int totNumBytesPerRB9 = 28;
 147
 148   /// Compressed data write mask for each iqWidth option
 149   constexpr uint16_t rbWriteMask9 = 0x01FF;
 150
 151   BFP_UPlane_9b16RB::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerRB9, rbWriteMask9);
 152 }
 153
 154
 155
 156 /// Main kernel function for expansion.
 157 /// Starts by determining iqWidth specific parameters and functions.
 158 void
 159 BlockFloatCompander::BFPExpandUserPlaneAvx512_9b16RB(const CompressedData& dataIn, ExpandedData* dataOut)
 160 {
 161   constexpr int k_totNumBytesPerRB9 = 28;
 162   constexpr int k_maxExpShift9 = 7;
 163   BFP_UPlane_9b16RB::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, k_totNumBytesPerRB9, k_maxExpShift9);
 164 }