fhi_lib/lib/src/xran_bfp_cplane32.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 32T32R
  21  *
  22  * @file xran_bfp_cplane32.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include <complex>
  30 #include <algorithm>
  31 #include <immintrin.h>
  32
  33
  34 namespace BFP_CPlane_32
  35 {
  /// Namespace constants
  /// Number of 16-bit data elements per compression block: 32 IQ pairs (one I and one Q value per pair)
  constexpr int k_numDataElements = 64;
  /// Number of AVX512 registers per compression block (input)
  constexpr int k_numRegsPerBlock = 2;
  39
  40   inline int
  41   maxAbsOneBlock(const __m512i* inData)
  42   {
  43     /// Vertical maxAbs on all registers
  44     __m512i maxAbsReg = __m512i();
  45 #pragma unroll(k_numRegsPerBlock)
  46     for (int n = 0; n < k_numRegsPerBlock; ++n)
  47     {
  48       const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
  49       maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
  50     }
  51     /// Horizontal max across remaining register
  52     return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
  53   }
  54
  55   /// Compute exponent value for a set of 16 RB from the maximum absolute value
  56   inline __m512i
  57   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  58   {
  59     __m512i maxAbs = __m512i();
  60     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  61 #pragma unroll(16)
  62     for (int n = 0; n < 16; ++n)
  63     {
  64       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  65     }
  66     /// Calculate exponent
  67     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  68   }
  69
  70   /// Compute exponent value for a set of 4 RB from the maximum absolute value
  71   inline __m512i
  72   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  73   {
  74     __m512i maxAbs = __m512i();
  75     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  76 #pragma unroll(4)
  77     for (int n = 0; n < 4; ++n)
  78     {
  79       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  80     }
  81     /// Calculate exponent
  82     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  83   }
  84
  85   /// Compute exponent value for 1 RB from the maximum absolute value
  86   inline uint8_t
  87   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  88   {
  89     __m512i maxAbs = __m512i();
  90     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  91     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  92     /// Calculate exponent
  93     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  94     return ((uint8_t*)&exps)[0];
  95   }
  96
  97
  98
  99   /// Apply compression to one compression block
 100   template<BlockFloatCompander::PackFunction networkBytePack>
 101   inline void
 102   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
 103                         const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 104   {
 105     /// Store exponent first
 106     *outBlockAddr = thisExp;
 107 #pragma unroll(k_numRegsPerBlock)
 108     for (int n = 0; n < k_numRegsPerBlock; ++n)
 109     {
 110       /// Apply the exponent shift
 111       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 112       /// Pack compressed data network byte order
 113       const auto compDataBytePacked = networkBytePack(compData);
 114       /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 115       /// Use four offset stores to join
 116       const auto thisOutRegAddr = outBlockAddr + 1 + n * totNumBytesPerReg;
 117       _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 118       _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 119       _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 120       _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 121     }
 122   }
 123
 124   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 125   template<BlockFloatCompander::PackFunction networkBytePack>
 126   inline void
 127   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 128                  const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 129   {
 130     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 131     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 132 #pragma unroll(16)
 133     for (int n = 0; n < 16; ++n)
 134     {
 135       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 136     }
 137   }
 138
 139   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 140   template<BlockFloatCompander::PackFunction networkBytePack>
 141   inline void
 142   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 143                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 144   {
 145     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 146     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 147 #pragma unroll(4)
 148     for (int n = 0; n < 4; ++n)
 149     {
 150       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 151     }
 152   }
 153
 154   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 155   template<BlockFloatCompander::PackFunction networkBytePack>
 156   inline void
 157   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 158                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 159   {
 160     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 161     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 162     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
 163   }
 164
 165   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 166   template<BlockFloatCompander::PackFunction networkBytePack>
 167   inline void
 168   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 169                    const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 170   {
 171     switch (dataIn.numBlocks)
 172     {
 173     case 16:
 174       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 175       break;
 176
 177     case 4:
 178       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 179       break;
 180
 181     case 1:
 182       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 183       break;
 184     }
 185   }
 186
 187
 188
 189   /// Apply 8b compression to 1 compression block.
 190   inline void
 191   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 192   {
 193     /// Store exponent first
 194     *outBlockAddr = thisExp;
 195     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 196     __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
 197 #pragma unroll(k_numRegsPerBlock)
 198     for (int n = 0; n < k_numRegsPerBlock; ++n)
 199     {
 200       /// Apply the exponent shift
 201       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 202       /// Truncate to 8bit and store
 203       _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
 204     }
 205   }
 206
 207   /// Derive and apply 8b compression to 16 compression blocks
 208   inline void
 209   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 210   {
 211     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 212     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 213 #pragma unroll(16)
 214     for (int n = 0; n < 16; ++n)
 215     {
 216       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 217     }
 218   }
 219
 220   /// Derive and apply 8b compression to 4 compression blocks
 221   inline void
 222   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 223   {
 224     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 225     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 226 #pragma unroll(4)
 227     for (int n = 0; n < 4; ++n)
 228     {
 229       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 230     }
 231   }
 232
 233   /// Derive and apply 8b compression to 1 compression block
 234   inline void
 235   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 236   {
 237     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 238     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 239     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 240   }
 241
 242   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 243   inline void
 244   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 245   {
 246     switch (dataIn.numBlocks)
 247     {
 248     case 16:
 249       compress8_16RB(dataIn, dataOut, totShiftBits);
 250       break;
 251
 252     case 4:
 253       compress8_4RB(dataIn, dataOut, totShiftBits);
 254       break;
 255
 256     case 1:
 257       compress8_1RB(dataIn, dataOut, totShiftBits);
 258       break;
 259     }
 260   }
 261
 262
 263
 264   /// Expand 1 compression block
 265   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 266   inline void
 267   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
 268   {
 269     static constexpr uint8_t k_WriteMask = 0xFF;
 270     const auto thisExpShift = maxExpShift - *expAddr;
 271 #pragma unroll(k_numRegsPerBlock)
 272     for (int n = 0; n < k_numRegsPerBlock; ++n)
 273     {
 274       const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
 275       /// Unpack network order packed data
 276       const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
 277       /// Apply exponent scaling (by appropriate arithmetic shift right)
 278       const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 279       /// Write expanded data to output
 280       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
 281     }
 282   }
 283
 284   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 285   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 286   void
 287   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 288                  const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
 289   {
 290     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 291     switch (dataIn.numBlocks)
 292     {
 293     case 16:
 294 #pragma unroll(16)
 295       for (int n = 0; n < 16; ++n)
 296       {
 297         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 298       }
 299       break;
 300
 301     case 4:
 302 #pragma unroll(4)
 303       for (int n = 0; n < 4; ++n)
 304       {
 305         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 306       }
 307       break;
 308
 309     case 1:
 310       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
 311       break;
 312     }
 313   }
 314
 315
 316   /// Apply expansion to 1 compression block
 317   inline void
 318   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 319   {
 320     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 321     static constexpr uint8_t k_WriteMask = 0xFF;
 322 #pragma unroll(k_numRegsPerBlock)
 323     for (int n = 0; n < k_numRegsPerBlock; ++n)
 324     {
 325       const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
 326       const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 327       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
 328     }
 329   }
 330
 331   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 332   void
 333   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 334   {
 335     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 336     switch (dataIn.numBlocks)
 337     {
 338     case 16:
 339 #pragma unroll(16)
 340       for (int n = 0; n < 16; ++n)
 341       {
 342         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 343       }
 344       break;
 345
 346     case 4:
 347 #pragma unroll(4)
 348       for (int n = 0; n < 4; ++n)
 349       {
 350         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 351       }
 352       break;
 353
 354     case 1:
 355       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 356       break;
 357     }
 358   }
 359 }
 360
 361
 362 /// Main kernel function for 32 antenna C-plane compression.
 363 /// Starts by determining iqWidth specific parameters and functions.
 364 void
 365 BlockFloatCompander::BFPCompressCtrlPlane32Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 366 {
 367   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 368   const auto totShiftBits8 = _mm512_set1_epi32(25);
 369   const auto totShiftBits9 = _mm512_set1_epi32(24);
 370   const auto totShiftBits10 = _mm512_set1_epi32(23);
 371   const auto totShiftBits12 = _mm512_set1_epi32(21);
 372
 373   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 374   const auto totNumBytesPerBlock = ((BFP_CPlane_32::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 375   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 376   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 377
 378   /// Compressed data write mask for each iqWidth option
 379   constexpr uint16_t rbWriteMask9 = 0x01FF;
 380   constexpr uint16_t rbWriteMask10 = 0x03FF;
 381   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 382
 383   switch (dataIn.iqWidth)
 384   {
 385   case 8:
 386     BFP_CPlane_32::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 387     break;
 388
 389   case 9:
 390     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
 391     break;
 392
 393   case 10:
 394     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
 395     break;
 396
 397   case 12:
 398     BFP_CPlane_32::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
 399     break;
 400   }
 401 }
 402
 403
 404 /// Main kernel function for 32 antenna C-plane expansion.
 405 /// Starts by determining iqWidth specific parameters and functions.
 406 void
 407 BlockFloatCompander::BFPExpandCtrlPlane32Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 408 {
 409   constexpr int k_maxExpShift9 = 7;
 410   constexpr int k_maxExpShift10 = 6;
 411   constexpr int k_maxExpShift12 = 4;
 412
 413   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 414   const auto totNumBytesPerBlock = ((BFP_CPlane_32::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 415   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 416   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 417
 418   switch (dataIn.iqWidth)
 419   {
 420   case 8:
 421     BFP_CPlane_32::expandByAlloc8(dataIn, dataOut);
 422     break;
 423
 424   case 9:
 425     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
 426     break;
 427
 428   case 10:
 429     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
 430     break;
 431
 432   case 12:
 433     BFP_CPlane_32::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);
 434     break;
 435   }
 436 }