fhi_lib/lib/src/xran_bfp_cplane32.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 32T32R
  21  *
  22  * @file xran_bfp_cplane32.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33
  34
  35 namespace BFP_CPlane_32
  36 {
  /// Namespace constants
  constexpr int k_numDataElements = 64; /// 32 IQ pairs (one I and one Q value for each of the 32 antennas)
  constexpr int k_numRegsPerBlock = 2; /// Number of AVX512 registers per compression block (input): 64 x 16 bit = 2 x 512 bit
  40
  41   inline int
  42   maxAbsOneBlock(const __m512i* inData)
  43   {
  44     /// Vertical maxAbs on all registers
  45     __m512i maxAbsReg = __m512i();
  46 #pragma unroll(k_numRegsPerBlock)
  47     for (int n = 0; n < k_numRegsPerBlock; ++n)
  48     {
  49       const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
  50       maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
  51     }
  52     /// Horizontal max across remaining register
  53     return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
  54   }
  55
  56   /// Compute exponent value for a set of 16 RB from the maximum absolute value
  57   inline __m512i
  58   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  59   {
  60     __m512i maxAbs = __m512i();
  61     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  62 #pragma unroll(16)
  63     for (int n = 0; n < 16; ++n)
  64     {
  65       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  66     }
  67     /// Calculate exponent
  68     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  69   }
  70
  71   /// Compute exponent value for a set of 4 RB from the maximum absolute value
  72   inline __m512i
  73   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  74   {
  75     __m512i maxAbs = __m512i();
  76     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  77 #pragma unroll(4)
  78     for (int n = 0; n < 4; ++n)
  79     {
  80       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  81     }
  82     /// Calculate exponent
  83     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  84   }
  85
  86   /// Compute exponent value for 1 RB from the maximum absolute value
  87   inline uint8_t
  88   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  89   {
  90     __m512i maxAbs = __m512i();
  91     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  92     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  93     /// Calculate exponent
  94     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  95     return ((uint8_t*)&exps)[0];
  96   }
  97
  98
  99
 100   /// Apply compression to one compression block
 101   template<BlockFloatCompander::PackFunction networkBytePack>
 102   inline void
 103   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
 104                         const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 105   {
 106     /// Store exponent first
 107     *outBlockAddr = thisExp;
 108 #pragma unroll(k_numRegsPerBlock)
 109     for (int n = 0; n < k_numRegsPerBlock; ++n)
 110     {
 111       /// Apply the exponent shift
 112       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 113       /// Pack compressed data network byte order
 114       const auto compDataBytePacked = networkBytePack(compData);
 115       /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 116       /// Use four offset stores to join
 117       const auto thisOutRegAddr = outBlockAddr + 1 + n * totNumBytesPerReg;
 118       _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 119       _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 120       _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 121       _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 122     }
 123   }
 124
 125   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 126   template<BlockFloatCompander::PackFunction networkBytePack>
 127   inline void
 128   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 129                  const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 130   {
 131     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 132     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 133 #pragma unroll(16)
 134     for (int n = 0; n < 16; ++n)
 135     {
 136       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 137     }
 138   }
 139
 140   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 141   template<BlockFloatCompander::PackFunction networkBytePack>
 142   inline void
 143   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 144                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 145   {
 146     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 147     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 148 #pragma unroll(4)
 149     for (int n = 0; n < 4; ++n)
 150     {
 151       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 152     }
 153   }
 154
 155   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 156   template<BlockFloatCompander::PackFunction networkBytePack>
 157   inline void
 158   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 159                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 160   {
 161     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 162     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 163     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
 164   }
 165
 166   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 167   template<BlockFloatCompander::PackFunction networkBytePack>
 168   inline void
 169   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 170                    const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 171   {
 172     switch (dataIn.numBlocks)
 173     {
 174     case 16:
 175       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 176       break;
 177
 178     case 4:
 179       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 180       break;
 181
 182     case 1:
 183       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 184       break;
 185     }
 186   }
 187
 188
 189
 190   /// Apply 8b compression to 1 compression block.
 191   inline void
 192   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 193   {
 194     /// Store exponent first
 195     *outBlockAddr = thisExp;
 196     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 197     __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
 198 #pragma unroll(k_numRegsPerBlock)
 199     for (int n = 0; n < k_numRegsPerBlock; ++n)
 200     {
 201       /// Apply the exponent shift
 202       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 203       /// Truncate to 8bit and store
 204       _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
 205     }
 206   }
 207
 208   /// Derive and apply 8b compression to 16 compression blocks
 209   inline void
 210   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 211   {
 212     const __m512i exponents = computeExponent_16RB(dataIn, totShiftBits);
 213     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 214 #pragma unroll(16)
 215     for (int n = 0; n < 16; ++n)
 216     {
 217       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 218     }
 219   }
 220
 221   /// Derive and apply 8b compression to 4 compression blocks
 222   inline void
 223   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 224   {
 225     const __m512i exponents = computeExponent_4RB(dataIn, totShiftBits);
 226     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 227 #pragma unroll(4)
 228     for (int n = 0; n < 4; ++n)
 229     {
 230       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 231     }
 232   }
 233
 234   /// Derive and apply 8b compression to 1 compression block
 235   inline void
 236   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 237   {
 238     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 239     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 240     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 241   }
 242
 243   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 244   inline void
 245   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 246   {
 247     switch (dataIn.numBlocks)
 248     {
 249     case 16:
 250       compress8_16RB(dataIn, dataOut, totShiftBits);
 251       break;
 252
 253     case 4:
 254       compress8_4RB(dataIn, dataOut, totShiftBits);
 255       break;
 256
 257     case 1:
 258       compress8_1RB(dataIn, dataOut, totShiftBits);
 259       break;
 260     }
 261   }
 262
 263
 264
 265   /// Expand 1 compression block
 266   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 267   inline void
 268   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
 269   {
 270     static constexpr uint8_t k_WriteMask = 0xFF;
 271     const auto thisExpShift = maxExpShift - *expAddr;
 272 #pragma unroll(k_numRegsPerBlock)
 273     for (int n = 0; n < k_numRegsPerBlock; ++n)
 274     {
 275       const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
 276       /// Unpack network order packed data
 277       const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
 278       /// Apply exponent scaling (by appropriate arithmetic shift right)
 279       const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 280       /// Write expanded data to output
 281       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
 282     }
 283   }
 284
 285   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 286   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 287   void
 288   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 289                  const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
 290   {
 291     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 292     switch (dataIn.numBlocks)
 293     {
 294     case 16:
 295 #pragma unroll(16)
 296       for (int n = 0; n < 16; ++n)
 297       {
 298         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 299       }
 300       break;
 301
 302     case 4:
 303 #pragma unroll(4)
 304       for (int n = 0; n < 4; ++n)
 305       {
 306         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 307       }
 308       break;
 309
 310     case 1:
 311       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
 312       break;
 313     }
 314   }
 315
 316
 317   /// Apply expansion to 1 compression block
 318   inline void
 319   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 320   {
 321     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 322     static constexpr uint8_t k_WriteMask = 0xFF;
 323 #pragma unroll(k_numRegsPerBlock)
 324     for (int n = 0; n < k_numRegsPerBlock; ++n)
 325     {
 326       const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
 327       const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 328       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
 329     }
 330   }
 331
 332   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 333   void
 334   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 335   {
 336     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 337     switch (dataIn.numBlocks)
 338     {
 339     case 16:
 340 #pragma unroll(16)
 341       for (int n = 0; n < 16; ++n)
 342       {
 343         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 344       }
 345       break;
 346
 347     case 4:
 348 #pragma unroll(4)
 349       for (int n = 0; n < 4; ++n)
 350       {
 351         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 352       }
 353       break;
 354
 355     case 1:
 356       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 357       break;
 358     }
 359   }
 360 }
 361
 362
 363 /// Main kernel function for 32 antenna C-plane compression.
 364 /// Starts by determining iqWidth specific parameters and functions.
 365 void
 366 BlockFloatCompander::BFPCompressCtrlPlane32Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 367 {
 368   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 369   const auto totShiftBits8 = _mm512_set1_epi32(25);
 370   const auto totShiftBits9 = _mm512_set1_epi32(24);
 371   const auto totShiftBits10 = _mm512_set1_epi32(23);
 372   const auto totShiftBits12 = _mm512_set1_epi32(21);
 373
 374   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 375   const auto totNumBytesPerBlock = ((BFP_CPlane_32::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 376   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 377   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 378
 379   /// Compressed data write mask for each iqWidth option
 380   constexpr uint16_t rbWriteMask9 = 0x01FF;
 381   constexpr uint16_t rbWriteMask10 = 0x03FF;
 382   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 383
 384   switch (dataIn.iqWidth)
 385   {
 386   case 8:
 387     BFP_CPlane_32::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 388     break;
 389
 390   case 9:
 391     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
 392     break;
 393
 394   case 10:
 395     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
 396     break;
 397
 398   case 12:
 399     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
 400     break;
 401   }
 402 }
 403
 404
 405 /// Main kernel function for 32 antenna C-plane expansion.
 406 /// Starts by determining iqWidth specific parameters and functions.
 407 void
 408 BlockFloatCompander::BFPExpandCtrlPlane32Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 409 {
 410   constexpr int k_maxExpShift9 = 7;
 411   constexpr int k_maxExpShift10 = 6;
 412   constexpr int k_maxExpShift12 = 4;
 413
 414   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 415   const auto totNumBytesPerBlock = ((BFP_CPlane_32::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 416   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 417   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 418
 419   switch (dataIn.iqWidth)
 420   {
 421   case 8:
 422     BFP_CPlane_32::expandByAlloc8(dataIn, dataOut);
 423     break;
 424
 425   case 9:
 426     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
 427     break;
 428
 429   case 10:
 430     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
 431     break;
 432
 433   case 12:
 434     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);
 435     break;
 436   }
 437 }