fhi_lib/lib/src/xran_bfp_cplane64.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 64T64R
  21  *
  22  * @file xran_bfp_cplane64.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33
  34
  35 namespace BFP_CPlane_64
  36 {
  /// Namespace constants
  /// Number of 16-bit data elements in one compression block:
  /// k_numRegsPerBlock registers x 32 int16 lanes = 128 elements, i.e. 64 IQ pairs
  constexpr int k_numDataElements = 128;
  /// Number of AVX512 registers holding one uncompressed compression block (input)
  constexpr int k_numRegsPerBlock = 4;
  40
  41   inline int
  42   maxAbsOneBlock(const __m512i* inData)
  43   {
  44     /// Vertical maxAbs on all registers
  45     __m512i maxAbsReg = __m512i();
  46 #pragma unroll(k_numRegsPerBlock)
  47     for (int n = 0; n < k_numRegsPerBlock; ++n)
  48     {
  49       const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
  50       maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
  51     }
  52     /// Horizontal max across remaining register
  53     return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
  54   }
  55
  56   /// Compute exponent value for a set of 16 RB from the maximum absolute value
  57   inline __m512i
  58   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  59   {
  60     __m512i maxAbs = __m512i();
  61     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  62 #pragma unroll(16)
  63     for (int n = 0; n < 16; ++n)
  64     {
  65       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  66     }
  67     /// Calculate exponent
  68     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  69   }
  70
  71   /// Compute exponent value for a set of 4 RB from the maximum absolute value
  72   inline __m512i
  73   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  74   {
  75     __m512i maxAbs = __m512i();
  76     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  77 #pragma unroll(4)
  78     for (int n = 0; n < 4; ++n)
  79     {
  80       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  81     }
  82     /// Calculate exponent
  83     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  84   }
  85
  86   /// Compute exponent value for 1 RB from the maximum absolute value
  87   inline uint8_t
  88   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  89   {
  90     __m512i maxAbs = __m512i();
  91     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  92     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  93     /// Calculate exponent
  94     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  95     return ((uint8_t*)&exps)[0];
  96   }
  97
  98
  99
 100   /// Apply compression to one compression block
 101   template<BlockFloatCompander::PackFunction networkBytePack>
 102   inline void
 103   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
 104                         const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 105   {
 106     /// Store exponent first
 107     *outBlockAddr = thisExp;
 108 #pragma unroll(k_numRegsPerBlock)
 109     for (int n = 0; n < k_numRegsPerBlock; ++n)
 110     {
 111       /// Apply the exponent shift
 112       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 113       /// Pack compressed data network byte order
 114       const auto compDataBytePacked = networkBytePack(compData);
 115       /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 116       /// Use four offset stores to join
 117       const auto thisOutRegAddr = outBlockAddr + 1 + n * totNumBytesPerReg;
 118       _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 119       _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 120       _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 121       _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 122     }
 123   }
 124
 125   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 126   template<BlockFloatCompander::PackFunction networkBytePack>
 127   inline void
 128   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 129                  const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 130   {
 131     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 132     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 133 #pragma unroll(16)
 134     for (int n = 0; n < 16; ++n)
 135     {
 136       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 137     }
 138   }
 139
 140   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 141   template<BlockFloatCompander::PackFunction networkBytePack>
 142   inline void
 143   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 144                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 145   {
 146     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 147     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 148 #pragma unroll(4)
 149     for (int n = 0; n < 4; ++n)
 150     {
 151       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 152     }
 153   }
 154
 155   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 156   template<BlockFloatCompander::PackFunction networkBytePack>
 157   inline void
 158   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 159                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 160   {
 161     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 162     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 163     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
 164   }
 165
 166   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 167   template<BlockFloatCompander::PackFunction networkBytePack>
 168   inline void
 169   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 170                    const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 171   {
 172     switch (dataIn.numBlocks)
 173     {
 174     case 16:
 175       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 176       break;
 177
 178     case 4:
 179       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 180       break;
 181
 182     case 1:
 183       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 184       break;
 185     }
 186   }
 187
 188
 189
 190   /// Apply 8b compression to 1 compression block.
 191   inline void
 192   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 193   {
 194     /// Store exponent first
 195     *outBlockAddr = thisExp;
 196     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 197     __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
 198 #pragma unroll(k_numRegsPerBlock)
 199     for (int n = 0; n < k_numRegsPerBlock; ++n)
 200     {
 201       /// Apply the exponent shift
 202       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 203       /// Truncate to 8bit and store
 204       _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
 205     }
 206   }
 207
 208   /// Derive and apply 8b compression to 16 compression blocks
 209   inline void
 210   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 211   {
 212     const __m512i exponents = computeExponent_16RB(dataIn, totShiftBits);
 213     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 214 #pragma unroll(16)
 215     for (int n = 0; n < 16; ++n)
 216     {
 217       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 218     }
 219   }
 220
 221   /// Derive and apply 8b compression to 4 compression blocks
 222   inline void
 223   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 224   {
 225     const __m512i exponents = computeExponent_4RB(dataIn, totShiftBits);
 226     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 227 #pragma unroll(4)
 228     for (int n = 0; n < 4; ++n)
 229     {
 230       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 231     }
 232   }
 233
 234   /// Derive and apply 8b compression to 1 compression block
 235   inline void
 236   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 237   {
 238     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 239     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 240     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 241   }
 242
 243   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 244   inline void
 245   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 246   {
 247     switch (dataIn.numBlocks)
 248     {
 249     case 16:
 250       compress8_16RB(dataIn, dataOut, totShiftBits);
 251       break;
 252
 253     case 4:
 254       compress8_4RB(dataIn, dataOut, totShiftBits);
 255       break;
 256
 257     case 1:
 258       compress8_1RB(dataIn, dataOut, totShiftBits);
 259       break;
 260     }
 261   }
 262
 263
 264
 265   /// Expand 1 compression block
 266   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 267   inline void
 268   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
 269   {
 270     static constexpr uint8_t k_WriteMask = 0xFF;
 271     const auto thisExpShift = maxExpShift - *expAddr;
 272 #pragma unroll(k_numRegsPerBlock)
 273     for (int n = 0; n < k_numRegsPerBlock; ++n)
 274     {
 275       const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
 276       /// Unpack network order packed data
 277       const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
 278       /// Apply exponent scaling (by appropriate arithmetic shift right)
 279       const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 280       /// Write expanded data to output
 281       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
 282     }
 283   }
 284
 285   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 286   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 287   void expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 288                       const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
 289   {
 290     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 291     switch (dataIn.numBlocks)
 292     {
 293     case 16:
 294 #pragma unroll(16)
 295       for (int n = 0; n < 16; ++n)
 296       {
 297         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 298       }
 299       break;
 300
 301     case 4:
 302 #pragma unroll(4)
 303       for (int n = 0; n < 4; ++n)
 304       {
 305         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 306       }
 307       break;
 308
 309     case 1:
 310       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
 311       break;
 312     }
 313   }
 314
 315
 316   /// Apply expansion to 1 compression block
 317   inline void
 318   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 319   {
 320     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 321     static constexpr uint8_t k_WriteMask = 0xFF;
 322 #pragma unroll(k_numRegsPerBlock)
 323     for (int n = 0; n < k_numRegsPerBlock; ++n)
 324     {
 325       const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
 326       const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 327       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
 328     }
 329   }
 330
 331   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 332   void
 333   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 334   {
 335     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 336     switch (dataIn.numBlocks)
 337     {
 338     case 16:
 339 #pragma unroll(16)
 340       for (int n = 0; n < 16; ++n)
 341       {
 342         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 343       }
 344       break;
 345
 346     case 4:
 347 #pragma unroll(4)
 348       for (int n = 0; n < 4; ++n)
 349       {
 350         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 351       }
 352       break;
 353
 354     case 1:
 355       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 356       break;
 357     }
 358   }
 359 }
 360
 361
 362 /// Main kernel function for 64 antenna C-plane compression.
 363 /// Starts by determining iqWidth specific parameters and functions.
 364 void
 365 BlockFloatCompander::BFPCompressCtrlPlane64Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 366 {
 367   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 368   const auto totShiftBits8 = _mm512_set1_epi32(25);
 369   const auto totShiftBits9 = _mm512_set1_epi32(24);
 370   const auto totShiftBits10 = _mm512_set1_epi32(23);
 371   const auto totShiftBits12 = _mm512_set1_epi32(21);
 372
 373   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 374   const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 375   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 376   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 377
 378   /// Compressed data write mask for each iqWidth option
 379   constexpr uint16_t rbWriteMask9 = 0x01FF;
 380   constexpr uint16_t rbWriteMask10 = 0x03FF;
 381   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 382
 383   switch (dataIn.iqWidth)
 384   {
 385   case 8:
 386     BFP_CPlane_64::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 387     break;
 388
 389   case 9:
 390     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
 391     break;
 392
 393   case 10:
 394     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
 395     break;
 396
 397   case 12:
 398     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
 399     break;
 400   }
 401 }
 402
 403
 404 /// Main kernel function for 64 antenna C-plane expansion.
 405 /// Starts by determining iqWidth specific parameters and functions.
 406 void
 407 BlockFloatCompander::BFPExpandCtrlPlane64Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 408 {
 409   constexpr int k_maxExpShift9 = 7;
 410   constexpr int k_maxExpShift10 = 6;
 411   constexpr int k_maxExpShift12 = 4;
 412
 413   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 414   const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 415   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 416   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 417
 418   switch (dataIn.iqWidth)
 419   {
 420   case 8:
 421     BFP_CPlane_64::expandByAlloc8(dataIn, dataOut);
 422     break;
 423
 424   case 9:
 425     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
 426     break;
 427
 428   case 10:
 429     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
 430     break;
 431
 432   case 12:
 433     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);
 434     break;
 435   }
 436 }