fhi_lib/lib/src/xran_bfp_cplane16_snc.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 16T16R
  21  *
  22  * @file xran_bfp_cplane16_snc.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33
  34
  35 namespace BFP_CPlane_16_SNC
  36 {
  /// Namespace constants
  /// Number of 16-bit values in one compression block: 16 IQ pairs, i.e. 32 values,
  /// which is the content of a single 512-bit register.
  constexpr int k_numDataElements = 32;
  39
  40   inline int
  41   maxAbsOneBlock(const __m512i* inData)
  42   {
  43     /// Compute abs of input data
  44     const auto thisRegAbs = _mm512_abs_epi16(*inData);
  45     /// Horizontal max across register
  46     return BlockFloatCompander::horizontalMax1x32(thisRegAbs);
  47   }
  48
  49   /// Compute exponent value for a set of 16 RB from the maximum absolute value.
  50   inline __m512i
  51   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  52   {
  53     __m512i maxAbs = __m512i();
  54     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  55 #pragma unroll(16)
  56     for (int n = 0; n < 16; ++n)
  57     {
  58       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  59     }
  60     /// Calculate exponent
  61     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  62   }
  63
  64   /// Compute exponent value for a set of 4 RB from the maximum absolute value.
  65   inline __m512i
  66   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  67   {
  68     __m512i maxAbs = __m512i();
  69     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  70 #pragma unroll(4)
  71     for (int n = 0; n < 4; ++n)
  72     {
  73       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  74     }
  75     /// Calculate exponent
  76     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  77   }
  78
  79   /// Compute exponent value for 1 RB from the maximum absolute value.
  80   inline uint8_t
  81   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  82   {
  83     __m512i maxAbs = __m512i();
  84     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  85     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  86     /// Calculate exponent
  87     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  88     return ((uint8_t*)&exps)[0];
  89   }
  90
  91
  92
  93   /// Apply compression to one compression block
  94   template<BlockFloatCompander::PackFunction networkBytePack>
  95   inline void
  96   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
  97                         const int iqWidth, const uint8_t thisExp, const uint64_t rbWriteMask)
  98   {
  99     /// Store exponent first
 100     *outBlockAddr = thisExp;
 101     /// Apply the exponent shift
 102     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 103     /// Pack compressed data network byte order
 104     const auto compDataBytePacked = networkBytePack(compData);
 105     /// Store compressed data
 106     _mm512_mask_storeu_epi8(outBlockAddr + 1, rbWriteMask, compDataBytePacked);
 107   }
 108
 109   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 110   template<BlockFloatCompander::PackFunction networkBytePack>
 111   inline void
 112   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 113                  const __m512i totShiftBits, const int totNumBytesPerBlock, const uint64_t rbWriteMask)
 114   {
 115     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 116     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 117 #pragma unroll(16)
 118     for (int n = 0; n < 16; ++n)
 119     {
 120       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 121     }
 122   }
 123
 124   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 125   template<BlockFloatCompander::PackFunction networkBytePack>
 126   inline void
 127   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 128                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint64_t rbWriteMask)
 129   {
 130     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 131     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 132 #pragma unroll(4)
 133     for (int n = 0; n < 4; ++n)
 134     {
 135       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 136     }
 137   }
 138
 139   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 140   template<BlockFloatCompander::PackFunction networkBytePack>
 141   inline void
 142   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 143                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint64_t rbWriteMask)
 144   {
 145     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 146     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 147     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, rbWriteMask);
 148   }
 149
 150   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 151   template<BlockFloatCompander::PackFunction networkBytePack>
 152   inline void
 153   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 154                    const __m512i totShiftBits, const int totNumBytesPerBlock, const uint64_t rbWriteMask)
 155   {
 156     switch (dataIn.numBlocks)
 157     {
 158     case 16:
 159       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 160       break;
 161
 162     case 4:
 163       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 164       break;
 165
 166     case 1:
 167       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 168       break;
 169     }
 170   }
 171
 172
 173
 174   /// Apply 8b compression to 1 compression block.
 175   inline void
 176   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 177   {
 178     /// Store exponent first
 179     *outBlockAddr = thisExp;
 180     /// Apply the exponent shift
 181     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 182     /// Truncate to 8bit and store
 183     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 184     _mm256_mask_storeu_epi8(outBlockAddr + 1, k_writeMask, _mm512_cvtepi16_epi8(compData));
 185   }
 186
 187   /// Derive and apply 8b compression to 16 compression blocks
 188   inline void
 189   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 190   {
 191     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 192     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 193 #pragma unroll(16)
 194     for (int n = 0; n < 16; ++n)
 195     {
 196       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 197     }
 198   }
 199
 200   /// Derive and apply 8b compression to 4 compression blocks
 201   inline void
 202   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 203   {
 204     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 205     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 206 #pragma unroll(4)
 207     for (int n = 0; n < 4; ++n)
 208     {
 209       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 210     }
 211   }
 212
 213   /// Derive and apply 8b compression to 1 compression block
 214   inline void
 215   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 216   {
 217     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 218     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 219     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 220   }
 221
 222   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 223   inline void
 224   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 225   {
 226     switch (dataIn.numBlocks)
 227     {
 228     case 16:
 229       compress8_16RB(dataIn, dataOut, totShiftBits);
 230       break;
 231
 232     case 4:
 233       compress8_4RB(dataIn, dataOut, totShiftBits);
 234       break;
 235
 236     case 1:
 237       compress8_1RB(dataIn, dataOut, totShiftBits);
 238       break;
 239     }
 240   }
 241
 242
 243
 244   /// Expand 1 compression block
 245   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 246   inline void
 247   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift)
 248   {
 249     const auto thisExpShift = maxExpShift - *expAddr;
 250     /// Unpack network order packed data
 251     const auto inDataUnpacked = networkByteUnpack(expAddr + 1);
 252     /// Apply exponent scaling (by appropriate arithmetic shift right)
 253     const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 254     /// Write expanded data to output
 255     static constexpr uint8_t k_WriteMask = 0xFF;
 256     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expandedData);
 257   }
 258
 259   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 260   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 261   void
 262   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 263                  const int totNumBytesPerBlock, const int maxExpShift)
 264   {
 265     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 266     switch (dataIn.numBlocks)
 267     {
 268     case 16:
 269 #pragma unroll(16)
 270       for (int n = 0; n < 16; ++n)
 271       {
 272         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 273       }
 274       break;
 275
 276     case 4:
 277 #pragma unroll(4)
 278       for (int n = 0; n < 4; ++n)
 279       {
 280         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 281       }
 282       break;
 283
 284     case 1:
 285       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift);
 286       break;
 287     }
 288   }
 289
 290
 291   /// Apply expansion to 1 compression block
 292   inline void
 293   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 294   {
 295     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 296     const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
 297     const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 298     static constexpr uint8_t k_WriteMask = 0xFF;
 299     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expData);
 300   }
 301
 302   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 303   void
 304   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 305   {
 306     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 307     switch (dataIn.numBlocks)
 308     {
 309     case 16:
 310 #pragma unroll(16)
 311       for (int n = 0; n < 16; ++n)
 312       {
 313         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 314       }
 315       break;
 316
 317     case 4:
 318 #pragma unroll(4)
 319       for (int n = 0; n < 4; ++n)
 320       {
 321         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 322       }
 323       break;
 324
 325     case 1:
 326       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 327       break;
 328     }
 329   }
 330 }
 331
 332
 333 /// Main kernel function for 16 antenna C-plane compression.
 334 /// Starts by determining iqWidth specific parameters and functions.
 335 void
 336 BlockFloatCompander::BFPCompressCtrlPlane16AvxSnc(const ExpandedData& dataIn, CompressedData* dataOut)
 337 {
 338   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 339   const auto totShiftBits8 = _mm512_set1_epi32(25);
 340   const auto totShiftBits9 = _mm512_set1_epi32(24);
 341   const auto totShiftBits10 = _mm512_set1_epi32(23);
 342   const auto totShiftBits12 = _mm512_set1_epi32(21);
 343
 344   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 345   const auto totNumBytesPerBlock = ((BFP_CPlane_16_SNC::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 346
 347   /// Compressed data write mask for each iqWidth option
 348   constexpr uint64_t rbWriteMask9 = 0x0000000FFFFFFFFF;
 349   constexpr uint64_t rbWriteMask10 = 0x000000FFFFFFFFFF;
 350   constexpr uint64_t rbWriteMask12 = 0x0000FFFFFFFFFFFF;
 351
 352   switch (dataIn.iqWidth)
 353   {
 354   case 8:
 355     BFP_CPlane_16_SNC::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 356     break;
 357
 358   case 9:
 359     BFP_CPlane_16_SNC::compressByAllocN<BlockFloatCompander::networkBytePack9bSnc>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, rbWriteMask9);
 360     break;
 361
 362   case 10:
 363     BFP_CPlane_16_SNC::compressByAllocN<BlockFloatCompander::networkBytePack10bSnc>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, rbWriteMask10);
 364     break;
 365
 366   case 12:
 367     BFP_CPlane_16_SNC::compressByAllocN<BlockFloatCompander::networkBytePack12bSnc>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, rbWriteMask12);
 368     break;
 369   }
 370 }
 371
 372
 373 /// Main kernel function for 16 antenna C-plane  expansion.
 374 /// Starts by determining iqWidth specific parameters and functions.
 375 void
 376 BlockFloatCompander::BFPExpandCtrlPlane16AvxSnc(const CompressedData& dataIn, ExpandedData* dataOut)
 377 {
 378   constexpr int k_maxExpShift9 = 7;
 379   constexpr int k_maxExpShift10 = 6;
 380   constexpr int k_maxExpShift12 = 4;
 381
 382   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 383   const auto totNumBytesPerBlock = ((BFP_CPlane_16_SNC::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 384
 385   switch (dataIn.iqWidth)
 386   {
 387   case 8:
 388     BFP_CPlane_16_SNC::expandByAlloc8(dataIn, dataOut);
 389     break;
 390
 391   case 9:
 392     BFP_CPlane_16_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack9bSnc>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift9);
 393     break;
 394
 395   case 10:
 396     BFP_CPlane_16_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack10bSnc>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift10);
 397     break;
 398
 399   case 12:
 400     BFP_CPlane_16_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack12bSnc>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift12);
 401     break;
 402   }
 403 }