fhi_lib/lib/src/xran_bfp_cplane32_snc.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 32T32R
  21  *
 * @file xran_bfp_cplane32_snc.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33
  34
  35 namespace BFP_CPlane_32_SNC
  36 {
  /// Namespace constants
  const int k_numDataElements = 64; /// 64 data values (32 IQ pairs) per compression block
  const int k_numRegsPerBlock = 2; /// Number of AVX512 registers per compression block (input)
  40
  41   inline int
  42   maxAbsOneBlock(const __m512i* inData)
  43   {
  44     /// Vertical maxAbs on all registers
  45     __m512i maxAbsReg = __m512i();
  46 #pragma unroll(k_numRegsPerBlock)
  47     for (int n = 0; n < k_numRegsPerBlock; ++n)
  48     {
  49       const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
  50       maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
  51     }
  52     /// Horizontal max across remaining register
  53     return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
  54   }
  55
  56   /// Compute exponent value for a set of 16 RB from the maximum absolute value
  57   inline __m512i
  58   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  59   {
  60     __m512i maxAbs = __m512i();
  61     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  62 #pragma unroll(16)
  63     for (int n = 0; n < 16; ++n)
  64     {
  65       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  66     }
  67     /// Calculate exponent
  68     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  69   }
  70
  71   /// Compute exponent value for a set of 4 RB from the maximum absolute value
  72   inline __m512i
  73   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  74   {
  75     __m512i maxAbs = __m512i();
  76     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  77 #pragma unroll(4)
  78     for (int n = 0; n < 4; ++n)
  79     {
  80       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  81     }
  82     /// Calculate exponent
  83     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  84   }
  85
  86   /// Compute exponent value for 1 RB from the maximum absolute value
  87   inline uint8_t
  88   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  89   {
  90     __m512i maxAbs = __m512i();
  91     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  92     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  93     /// Calculate exponent
  94     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  95     return ((uint8_t*)&exps)[0];
  96   }
  97
  98
  99
 100   /// Apply compression to one compression block
 101   template<BlockFloatCompander::PackFunction networkBytePack>
 102   inline void
 103   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
 104                         const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint64_t rbWriteMask)
 105   {
 106     /// Store exponent first
 107     *outBlockAddr = thisExp;
 108 #pragma unroll(k_numRegsPerBlock)
 109     for (int n = 0; n < k_numRegsPerBlock; ++n)
 110     {
 111       /// Apply the exponent shift
 112       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 113       /// Pack compressed data network byte order
 114       const auto compDataBytePacked = networkBytePack(compData);
 115       /// Store compressed data
 116       _mm512_mask_storeu_epi8(outBlockAddr + 1 + n * totNumBytesPerReg, rbWriteMask, compDataBytePacked);
 117     }
 118   }
 119
 120   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 121   template<BlockFloatCompander::PackFunction networkBytePack>
 122   inline void
 123   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 124                  const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint64_t rbWriteMask)
 125   {
 126     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 127     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 128 #pragma unroll(16)
 129     for (int n = 0; n < 16; ++n)
 130     {
 131       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 132     }
 133   }
 134
 135   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 136   template<BlockFloatCompander::PackFunction networkBytePack>
 137   inline void
 138   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 139                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint64_t rbWriteMask)
 140   {
 141     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 142     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 143 #pragma unroll(4)
 144     for (int n = 0; n < 4; ++n)
 145     {
 146       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 147     }
 148   }
 149
 150   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 151   template<BlockFloatCompander::PackFunction networkBytePack>
 152   inline void
 153   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 154                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint64_t rbWriteMask)
 155   {
 156     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 157     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 158     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
 159   }
 160
 161   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 162   template<BlockFloatCompander::PackFunction networkBytePack>
 163   inline void
 164   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 165                    const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint64_t rbWriteMask)
 166   {
 167     switch (dataIn.numBlocks)
 168     {
 169     case 16:
 170       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 171       break;
 172
 173     case 4:
 174       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 175       break;
 176
 177     case 1:
 178       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 179       break;
 180     }
 181   }
 182
 183
 184
 185   /// Apply 8b compression to 1 compression block.
 186   inline void
 187   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 188   {
 189     /// Store exponent first
 190     *outBlockAddr = thisExp;
 191     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 192     __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
 193 #pragma unroll(k_numRegsPerBlock)
 194     for (int n = 0; n < k_numRegsPerBlock; ++n)
 195     {
 196       /// Apply the exponent shift
 197       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 198       /// Truncate to 8bit and store
 199       _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
 200     }
 201   }
 202
 203   /// Derive and apply 8b compression to 16 compression blocks
 204   inline void
 205   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 206   {
 207     const __m512i exponents = computeExponent_16RB(dataIn, totShiftBits);
 208     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 209 #pragma unroll(16)
 210     for (int n = 0; n < 16; ++n)
 211     {
 212       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 213     }
 214   }
 215
 216   /// Derive and apply 8b compression to 4 compression blocks
 217   inline void
 218   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 219   {
 220     const __m512i exponents = computeExponent_4RB(dataIn, totShiftBits);
 221     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 222 #pragma unroll(4)
 223     for (int n = 0; n < 4; ++n)
 224     {
 225       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 226     }
 227   }
 228
 229   /// Derive and apply 8b compression to 1 compression block
 230   inline void
 231   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 232   {
 233     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 234     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 235     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 236   }
 237
 238   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 239   inline void
 240   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 241   {
 242     switch (dataIn.numBlocks)
 243     {
 244     case 16:
 245       compress8_16RB(dataIn, dataOut, totShiftBits);
 246       break;
 247
 248     case 4:
 249       compress8_4RB(dataIn, dataOut, totShiftBits);
 250       break;
 251
 252     case 1:
 253       compress8_1RB(dataIn, dataOut, totShiftBits);
 254       break;
 255     }
 256   }
 257
 258
 259
 260   /// Expand 1 compression block
 261   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 262   inline void
 263   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
 264   {
 265     static constexpr uint8_t k_WriteMask = 0xFF;
 266     const auto thisExpShift = maxExpShift - *expAddr;
 267 #pragma unroll(k_numRegsPerBlock)
 268     for (int n = 0; n < k_numRegsPerBlock; ++n)
 269     {
 270       const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
 271       /// Unpack network order packed data
 272       const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
 273       /// Apply exponent scaling (by appropriate arithmetic shift right)
 274       const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 275       /// Write expanded data to output
 276       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
 277     }
 278   }
 279
 280   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 281   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 282   void
 283   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 284                  const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
 285   {
 286     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 287     switch (dataIn.numBlocks)
 288     {
 289     case 16:
 290 #pragma unroll(16)
 291       for (int n = 0; n < 16; ++n)
 292       {
 293         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 294       }
 295       break;
 296
 297     case 4:
 298 #pragma unroll(4)
 299       for (int n = 0; n < 4; ++n)
 300       {
 301         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 302       }
 303       break;
 304
 305     case 1:
 306       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
 307       break;
 308     }
 309   }
 310
 311
 312   /// Apply expansion to 1 compression block
 313   inline void
 314   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 315   {
 316     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 317     static constexpr uint8_t k_WriteMask = 0xFF;
 318 #pragma unroll(k_numRegsPerBlock)
 319     for (int n = 0; n < k_numRegsPerBlock; ++n)
 320     {
 321       const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
 322       const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 323       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
 324     }
 325   }
 326
 327   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 328   void
 329   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 330   {
 331     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 332     switch (dataIn.numBlocks)
 333     {
 334     case 16:
 335 #pragma unroll(16)
 336       for (int n = 0; n < 16; ++n)
 337       {
 338         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 339       }
 340       break;
 341
 342     case 4:
 343 #pragma unroll(4)
 344       for (int n = 0; n < 4; ++n)
 345       {
 346         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 347       }
 348       break;
 349
 350     case 1:
 351       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 352       break;
 353     }
 354   }
 355 }
 356
 357
 358 /// Main kernel function for 32 antenna C-plane compression.
 359 /// Starts by determining iqWidth specific parameters and functions.
 360 void
 361 BlockFloatCompander::BFPCompressCtrlPlane32AvxSnc(const ExpandedData& dataIn, CompressedData* dataOut)
 362 {
 363   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 364   const auto totShiftBits8 = _mm512_set1_epi32(25);
 365   const auto totShiftBits9 = _mm512_set1_epi32(24);
 366   const auto totShiftBits10 = _mm512_set1_epi32(23);
 367   const auto totShiftBits12 = _mm512_set1_epi32(21);
 368
 369   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 370   const auto totNumBytesPerBlock = ((BFP_CPlane_32_SNC::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 371   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 372   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 373
 374   /// Compressed data write mask for each iqWidth option
 375   constexpr uint64_t rbWriteMask9 = 0x0000000FFFFFFFFF;
 376   constexpr uint64_t rbWriteMask10 = 0x000000FFFFFFFFFF;
 377   constexpr uint64_t rbWriteMask12 = 0x0000FFFFFFFFFFFF;
 378
 379   switch (dataIn.iqWidth)
 380   {
 381   case 8:
 382     BFP_CPlane_32_SNC::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 383     break;
 384
 385   case 9:
 386     BFP_CPlane_32_SNC::compressByAllocN<BlockFloatCompander::networkBytePack9bSnc>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
 387     break;
 388
 389   case 10:
 390     BFP_CPlane_32_SNC::compressByAllocN<BlockFloatCompander::networkBytePack10bSnc>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
 391     break;
 392
 393   case 12:
 394     BFP_CPlane_32_SNC::compressByAllocN<BlockFloatCompander::networkBytePack12bSnc>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
 395     break;
 396   }
 397 }
 398
 399
 400 /// Main kernel function for 32 antenna C-plane expansion.
 401 /// Starts by determining iqWidth specific parameters and functions.
 402 void
 403 BlockFloatCompander::BFPExpandCtrlPlane32AvxSnc(const CompressedData& dataIn, ExpandedData* dataOut)
 404 {
 405   constexpr int k_maxExpShift9 = 7;
 406   constexpr int k_maxExpShift10 = 6;
 407   constexpr int k_maxExpShift12 = 4;
 408
 409   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 410   const auto totNumBytesPerBlock = ((BFP_CPlane_32_SNC::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 411   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 412   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 413
 414   switch (dataIn.iqWidth)
 415   {
 416   case 8:
 417     BFP_CPlane_32_SNC::expandByAlloc8(dataIn, dataOut);
 418     break;
 419
 420   case 9:
 421     BFP_CPlane_32_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack9bSnc>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
 422     break;
 423
 424   case 10:
 425     BFP_CPlane_32_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack10bSnc>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
 426     break;
 427
 428   case 12:
 429     BFP_CPlane_32_SNC::expandByAllocN<BlockFloatCompander::networkByteUnpack12bSnc>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);
 430     break;
 431   }
 432 }