fhi_lib/lib/src/xran_bfp_cplane64.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 64T64R
  21  *
  22  * @file xran_bfp_cplane64.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include <complex>
  30 #include <algorithm>
  31 #include <immintrin.h>
  32
  33
  34 namespace BFP_CPlane_64
  35 {
  /// Namespace constants
  constexpr int k_numDataElements = 128; /// 16-bit values per compression block, i.e. 64 IQ pairs
  constexpr int k_numRegsPerBlock = 4; /// Number of AVX512 registers per compression block (input)
  /// One 512-bit register holds 32 16-bit values, so both constants must agree
  static_assert(k_numDataElements == k_numRegsPerBlock * 32, "block size must match the number of input registers");
  39
  40   inline int
  41   maxAbsOneBlock(const __m512i* inData)
  42   {
  43     /// Vertical maxAbs on all registers
  44     __m512i maxAbsReg = __m512i();
  45 #pragma unroll(k_numRegsPerBlock)
  46     for (int n = 0; n < k_numRegsPerBlock; ++n)
  47     {
  48       const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
  49       maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
  50     }
  51     /// Horizontal max across remaining register
  52     return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
  53   }
  54
  55   /// Compute exponent value for a set of 16 RB from the maximum absolute value
  56   inline __m512i
  57   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  58   {
  59     __m512i maxAbs = __m512i();
  60     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  61 #pragma unroll(16)
  62     for (int n = 0; n < 16; ++n)
  63     {
  64       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  65     }
  66     /// Calculate exponent
  67     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  68   }
  69
  70   /// Compute exponent value for a set of 4 RB from the maximum absolute value
  71   inline __m512i
  72   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  73   {
  74     __m512i maxAbs = __m512i();
  75     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  76 #pragma unroll(4)
  77     for (int n = 0; n < 4; ++n)
  78     {
  79       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
  80     }
  81     /// Calculate exponent
  82     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  83   }
  84
  85   /// Compute exponent value for 1 RB from the maximum absolute value
  86   inline uint8_t
  87   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  88   {
  89     __m512i maxAbs = __m512i();
  90     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  91     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  92     /// Calculate exponent
  93     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  94     return ((uint8_t*)&exps)[0];
  95   }
  96
  97
  98
  99   /// Apply compression to one compression block
 100   template<BlockFloatCompander::PackFunction networkBytePack>
 101   inline void
 102   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
 103                         const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 104   {
 105     /// Store exponent first
 106     *outBlockAddr = thisExp;
 107 #pragma unroll(k_numRegsPerBlock)
 108     for (int n = 0; n < k_numRegsPerBlock; ++n)
 109     {
 110       /// Apply the exponent shift
 111       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 112       /// Pack compressed data network byte order
 113       const auto compDataBytePacked = networkBytePack(compData);
 114       /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 115       /// Use four offset stores to join
 116       const auto thisOutRegAddr = outBlockAddr + 1 + n * totNumBytesPerReg;
 117       _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 118       _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 119       _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 120       _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 121     }
 122   }
 123
 124   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 125   template<BlockFloatCompander::PackFunction networkBytePack>
 126   inline void
 127   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 128                  const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 129   {
 130     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 131     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 132 #pragma unroll(16)
 133     for (int n = 0; n < 16; ++n)
 134     {
 135       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 136     }
 137   }
 138
 139   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 140   template<BlockFloatCompander::PackFunction networkBytePack>
 141   inline void
 142   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 143                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 144   {
 145     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 146     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 147 #pragma unroll(4)
 148     for (int n = 0; n < 4; ++n)
 149     {
 150       applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
 151     }
 152   }
 153
 154   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 155   template<BlockFloatCompander::PackFunction networkBytePack>
 156   inline void
 157   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 158                 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 159   {
 160     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 161     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 162     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
 163   }
 164
 165   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 166   template<BlockFloatCompander::PackFunction networkBytePack>
 167   inline void
 168   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 169                    const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
 170   {
 171     switch (dataIn.numBlocks)
 172     {
 173     case 16:
 174       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 175       break;
 176
 177     case 4:
 178       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 179       break;
 180
 181     case 1:
 182       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
 183       break;
 184     }
 185   }
 186
 187
 188
 189   /// Apply 8b compression to 1 compression block.
 190   inline void
 191   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 192   {
 193     /// Store exponent first
 194     *outBlockAddr = thisExp;
 195     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 196     __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
 197 #pragma unroll(k_numRegsPerBlock)
 198     for (int n = 0; n < k_numRegsPerBlock; ++n)
 199     {
 200       /// Apply the exponent shift
 201       const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
 202       /// Truncate to 8bit and store
 203       _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
 204     }
 205   }
 206
 207   /// Derive and apply 8b compression to 16 compression blocks
 208   inline void
 209   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 210   {
 211     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 212     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 213 #pragma unroll(16)
 214     for (int n = 0; n < 16; ++n)
 215     {
 216       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 217     }
 218   }
 219
 220   /// Derive and apply 8b compression to 4 compression blocks
 221   inline void
 222   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 223   {
 224     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 225     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 226 #pragma unroll(4)
 227     for (int n = 0; n < 4; ++n)
 228     {
 229       applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 230     }
 231   }
 232
 233   /// Derive and apply 8b compression to 1 compression block
 234   inline void
 235   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 236   {
 237     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 238     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 239     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 240   }
 241
 242   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 243   inline void
 244   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 245   {
 246     switch (dataIn.numBlocks)
 247     {
 248     case 16:
 249       compress8_16RB(dataIn, dataOut, totShiftBits);
 250       break;
 251
 252     case 4:
 253       compress8_4RB(dataIn, dataOut, totShiftBits);
 254       break;
 255
 256     case 1:
 257       compress8_1RB(dataIn, dataOut, totShiftBits);
 258       break;
 259     }
 260   }
 261
 262
 263
 264   /// Expand 1 compression block
 265   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 266   inline void
 267   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
 268   {
 269     static constexpr uint8_t k_WriteMask = 0xFF;
 270     const auto thisExpShift = maxExpShift - *expAddr;
 271 #pragma unroll(k_numRegsPerBlock)
 272     for (int n = 0; n < k_numRegsPerBlock; ++n)
 273     {
 274       const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
 275       /// Unpack network order packed data
 276       const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
 277       /// Apply exponent scaling (by appropriate arithmetic shift right)
 278       const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 279       /// Write expanded data to output
 280       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
 281     }
 282   }
 283
 284   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 285   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 286   void expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 287                       const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
 288   {
 289     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 290     switch (dataIn.numBlocks)
 291     {
 292     case 16:
 293 #pragma unroll(16)
 294       for (int n = 0; n < 16; ++n)
 295       {
 296         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 297       }
 298       break;
 299
 300     case 4:
 301 #pragma unroll(4)
 302       for (int n = 0; n < 4; ++n)
 303       {
 304         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
 305       }
 306       break;
 307
 308     case 1:
 309       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
 310       break;
 311     }
 312   }
 313
 314
 315   /// Apply expansion to 1 compression block
 316   inline void
 317   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 318   {
 319     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 320     static constexpr uint8_t k_WriteMask = 0xFF;
 321 #pragma unroll(k_numRegsPerBlock)
 322     for (int n = 0; n < k_numRegsPerBlock; ++n)
 323     {
 324       const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
 325       const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 326       _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
 327     }
 328   }
 329
 330   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 331   void
 332   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 333   {
 334     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 335     switch (dataIn.numBlocks)
 336     {
 337     case 16:
 338 #pragma unroll(16)
 339       for (int n = 0; n < 16; ++n)
 340       {
 341         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 342       }
 343       break;
 344
 345     case 4:
 346 #pragma unroll(4)
 347       for (int n = 0; n < 4; ++n)
 348       {
 349         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
 350       }
 351       break;
 352
 353     case 1:
 354       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 355       break;
 356     }
 357   }
 358 }
 359
 360
 361 /// Main kernel function for 64 antenna C-plane compression.
 362 /// Starts by determining iqWidth specific parameters and functions.
 363 void
 364 BlockFloatCompander::BFPCompressCtrlPlane64Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 365 {
 366   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 367   const auto totShiftBits8 = _mm512_set1_epi32(25);
 368   const auto totShiftBits9 = _mm512_set1_epi32(24);
 369   const auto totShiftBits10 = _mm512_set1_epi32(23);
 370   const auto totShiftBits12 = _mm512_set1_epi32(21);
 371
 372   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 373   const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 374   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 375   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 376
 377   /// Compressed data write mask for each iqWidth option
 378   constexpr uint16_t rbWriteMask9 = 0x01FF;
 379   constexpr uint16_t rbWriteMask10 = 0x03FF;
 380   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 381
 382   switch (dataIn.iqWidth)
 383   {
 384   case 8:
 385     BFP_CPlane_64::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 386     break;
 387
 388   case 9:
 389     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
 390     break;
 391
 392   case 10:
 393     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
 394     break;
 395
 396   case 12:
 397     BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
 398     break;
 399   }
 400 }
 401
 402
 403 /// Main kernel function for 64 antenna C-plane expansion.
 404 /// Starts by determining iqWidth specific parameters and functions.
 405 void
 406 BlockFloatCompander::BFPExpandCtrlPlane64Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 407 {
 408   constexpr int k_maxExpShift9 = 7;
 409   constexpr int k_maxExpShift10 = 6;
 410   constexpr int k_maxExpShift12 = 4;
 411
 412   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 413   const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 414   /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
 415   const auto totNumBytesPerReg = dataIn.iqWidth << 2;
 416
 417   switch (dataIn.iqWidth)
 418   {
 419   case 8:
 420     BFP_CPlane_64::expandByAlloc8(dataIn, dataOut);
 421     break;
 422
 423   case 9:
 424     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
 425     break;
 426
 427   case 10:
 428     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
 429     break;
 430
 431   case 12:
 432     BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);
 433     break;
 434   }
 435 }