fhi_lib/lib/src/xran_bfp_cplane16.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 16T16R
  21  *
  22  * @file xran_bfp_cplane16.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include <complex>
  30 #include <algorithm>
  31 #include <immintrin.h>
  32
  33
  34 namespace BFP_CPlane_16
  35 {
  /// Namespace constants
  /// Number of data elements in one compression block: 16 IQ pairs, two values per pair
  constexpr int k_numDataElements = 16 * 2;
  38
  39   inline int
  40   maxAbsOneBlock(const __m512i* inData)
  41   {
  42     /// Compute abs of input data
  43     const auto thisRegAbs = _mm512_abs_epi16(*inData);
  44     /// Horizontal max across register
  45     return BlockFloatCompander::horizontalMax1x32(thisRegAbs);
  46   }
  47
  48   /// Compute exponent value for a set of 16 RB from the maximum absolute value.
  49   inline __m512i
  50   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  51   {
  52     __m512i maxAbs = __m512i();
  53     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  54 #pragma unroll(16)
  55     for (int n = 0; n < 16; ++n)
  56     {
  57       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  58     }
  59     /// Calculate exponent
  60     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  61   }
  62
  63   /// Compute exponent value for a set of 4 RB from the maximum absolute value.
  64   inline __m512i
  65   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  66   {
  67     __m512i maxAbs = __m512i();
  68     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  69 #pragma unroll(4)
  70     for (int n = 0; n < 4; ++n)
  71     {
  72       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  73     }
  74     /// Calculate exponent
  75     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  76   }
  77
  78   /// Compute exponent value for 1 RB from the maximum absolute value.
  79   inline uint8_t
  80   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  81   {
  82     __m512i maxAbs = __m512i();
  83     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  84     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  85     /// Calculate exponent
  86     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  87     return ((uint8_t*)&exps)[0];
  88   }
  89
  90
  91
  92   /// Apply compression to one compression block
  93   template<BlockFloatCompander::PackFunction networkBytePack>
  94   inline void
  95   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
  96                         const int iqWidth, const uint8_t thisExp, const uint16_t rbWriteMask)
  97   {
  98     /// Store exponent first
  99     *outBlockAddr = thisExp;
 100     /// Apply the exponent shift
 101     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 102     /// Pack compressed data network byte order
 103     const auto compDataBytePacked = networkBytePack(compData);
 104     /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 105     /// Use four offset stores to join
 106     const auto thisOutRegAddr = outBlockAddr + 1;
 107     _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 108     _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 109     _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 110     _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 111   }
 112
 113   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 114   template<BlockFloatCompander::PackFunction networkBytePack>
 115   inline void
 116   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 117                  const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 118   {
 119     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 120     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 121 #pragma unroll(16)
 122     for (int n = 0; n < 16; ++n)
 123     {
 124       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 125     }
 126   }
 127
 128   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 129   template<BlockFloatCompander::PackFunction networkBytePack>
 130   inline void
 131   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 132                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 133   {
 134     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 135     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 136 #pragma unroll(4)
 137     for (int n = 0; n < 4; ++n)
 138     {
 139       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 140     }
 141   }
 142
 143   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 144   template<BlockFloatCompander::PackFunction networkBytePack>
 145   inline void
 146   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 147                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 148   {
 149     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 150     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 151     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, rbWriteMask);
 152   }
 153
 154   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 155   template<BlockFloatCompander::PackFunction networkBytePack>
 156   inline void
 157   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 158                    const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 159   {
 160     switch (dataIn.numBlocks)
 161     {
 162     case 16:
 163       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 164       break;
 165
 166     case 4:
 167       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 168       break;
 169
 170     case 1:
 171       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 172       break;
 173     }
 174   }
 175
 176
 177
 178   /// Apply 8b compression to 1 compression block.
 179   inline void
 180   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 181   {
 182     /// Store exponent first
 183     *outBlockAddr = thisExp;
 184     /// Apply the exponent shift
 185     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 186     /// Truncate to 8bit and store
 187     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 188     _mm256_mask_storeu_epi8(outBlockAddr + 1, k_writeMask, _mm512_cvtepi16_epi8(compData));
 189   }
 190
 191   /// Derive and apply 8b compression to 16 compression blocks
 192   inline void
 193   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 194   {
 195     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 196     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 197 #pragma unroll(16)
 198     for (int n = 0; n < 16; ++n)
 199     {
 200       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 201     }
 202   }
 203
 204   /// Derive and apply 8b compression to 4 compression blocks
 205   inline void
 206   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 207   {
 208     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 209     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 210 #pragma unroll(4)
 211     for (int n = 0; n < 4; ++n)
 212     {
 213       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 214     }
 215   }
 216
 217   /// Derive and apply 8b compression to 1 compression block
 218   inline void
 219   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 220   {
 221     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 222     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 223     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 224   }
 225
 226   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 227   inline void
 228   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 229   {
 230     switch (dataIn.numBlocks)
 231     {
 232     case 16:
 233       compress8_16RB(dataIn, dataOut, totShiftBits);
 234       break;
 235
 236     case 4:
 237       compress8_4RB(dataIn, dataOut, totShiftBits);
 238       break;
 239
 240     case 1:
 241       compress8_1RB(dataIn, dataOut, totShiftBits);
 242       break;
 243     }
 244   }
 245
 246
 247
 248   /// Expand 1 compression block
 249   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 250   inline void
 251   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift)
 252   {
 253     const auto thisExpShift = maxExpShift - *expAddr;
 254     /// Unpack network order packed data
 255     const auto inDataUnpacked = networkByteUnpack(expAddr + 1);
 256     /// Apply exponent scaling (by appropriate arithmetic shift right)
 257     const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 258     /// Write expanded data to output
 259     static constexpr uint8_t k_WriteMask = 0xFF;
 260     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expandedData);
 261   }
 262
 263   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 264   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 265   void
 266   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 267                  const int totNumBytesPerBlock, const int maxExpShift)
 268   {
 269     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 270     switch (dataIn.numBlocks)
 271     {
 272     case 16:
 273 #pragma unroll(16)
 274       for (int n = 0; n < 16; ++n)
 275       {
 276         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 277       }
 278       break;
 279
 280     case 4:
 281 #pragma unroll(4)
 282       for (int n = 0; n < 4; ++n)
 283       {
 284         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 285       }
 286       break;
 287
 288     case 1:
 289       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift);
 290       break;
 291     }
 292   }
 293
 294
 295   /// Apply expansion to 1 compression block
 296   inline void
 297   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 298   {
 299     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 300     const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
 301     const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 302     static constexpr uint8_t k_WriteMask = 0xFF;
 303     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expData);
 304   }
 305
 306   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 307   void
 308   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 309   {
 310     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 311     switch (dataIn.numBlocks)
 312     {
 313     case 16:
 314 #pragma unroll(16)
 315       for (int n = 0; n < 16; ++n)
 316       {
 317         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 318       }
 319       break;
 320
 321     case 4:
 322 #pragma unroll(4)
 323       for (int n = 0; n < 4; ++n)
 324       {
 325         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 326       }
 327       break;
 328
 329     case 1:
 330       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 331       break;
 332     }
 333   }
 334 }
 335
 336
 337 /// Main kernel function for 16 antenna C-plane compression.
 338 /// Starts by determining iqWidth specific parameters and functions.
 339 void
 340 BlockFloatCompander::BFPCompressCtrlPlane16Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 341 {
 342   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 343   const auto totShiftBits8 = _mm512_set1_epi32(25);
 344   const auto totShiftBits9 = _mm512_set1_epi32(24);
 345   const auto totShiftBits10 = _mm512_set1_epi32(23);
 346   const auto totShiftBits12 = _mm512_set1_epi32(21);
 347
 348   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 349   const auto totNumBytesPerBlock = ((BFP_CPlane_16::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 350
 351   /// Compressed data write mask for each iqWidth option
 352   constexpr uint16_t rbWriteMask9 = 0x01FF;
 353   constexpr uint16_t rbWriteMask10 = 0x03FF;
 354   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 355
 356   switch (dataIn.iqWidth)
 357   {
 358   case 8:
 359     BFP_CPlane_16::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 360     break;
 361
 362   case 9:
 363     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, rbWriteMask9);
 364     break;
 365
 366   case 10:
 367     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, rbWriteMask10);
 368     break;
 369
 370   case 12:
 371     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, rbWriteMask12);
 372     break;
 373   }
 374 }
 375
 376
 377 /// Main kernel function for 16 antenna C-plane  expansion.
 378 /// Starts by determining iqWidth specific parameters and functions.
 379 void
 380 BlockFloatCompander::BFPExpandCtrlPlane16Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 381 {
 382   constexpr int k_maxExpShift9 = 7;
 383   constexpr int k_maxExpShift10 = 6;
 384   constexpr int k_maxExpShift12 = 4;
 385
 386   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 387   const auto totNumBytesPerBlock = ((BFP_CPlane_16::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 388
 389   switch (dataIn.iqWidth)
 390   {
 391   case 8:
 392     BFP_CPlane_16::expandByAlloc8(dataIn, dataOut);
 393     break;
 394
 395   case 9:
 396     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift9);
 397     break;
 398
 399   case 10:
 400     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift10);
 401     break;
 402
 403   case 12:
 404     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift12);
 405     break;
 406   }
 407 }