fhi_lib/lib/src/xran_bfp_cplane16.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2020 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression for C-plane with 16T16R
  21  *
  22  * @file xran_bfp_cplane16.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_bfp_byte_packing_utils.hpp"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33
  34
  35 namespace BFP_CPlane_16
  36 {
  /// Number of 16-bit values in one compression block: 16 IQ pairs, I and Q counted separately.
  constexpr int k_numDataElements = 32;
  39
  40   inline int
  41   maxAbsOneBlock(const __m512i* inData)
  42   {
  43     /// Compute abs of input data
  44     const auto thisRegAbs = _mm512_abs_epi16(*inData);
  45     /// Horizontal max across register
  46     return BlockFloatCompander::horizontalMax1x32(thisRegAbs);
  47   }
  48
  49   /// Compute exponent value for a set of 16 RB from the maximum absolute value.
  50   inline __m512i
  51   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  52   {
  53     __m512i maxAbs = __m512i();
  54     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  55 #pragma unroll(16)
  56     for (int n = 0; n < 16; ++n)
  57     {
  58       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  59     }
  60     /// Calculate exponent
  61     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  62   }
  63
  64   /// Compute exponent value for a set of 4 RB from the maximum absolute value.
  65   inline __m512i
  66   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  67   {
  68     __m512i maxAbs = __m512i();
  69     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  70 #pragma unroll(4)
  71     for (int n = 0; n < 4; ++n)
  72     {
  73       ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n);
  74     }
  75     /// Calculate exponent
  76     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  77   }
  78
  79   /// Compute exponent value for 1 RB from the maximum absolute value.
  80   inline uint8_t
  81   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
  82   {
  83     __m512i maxAbs = __m512i();
  84     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  85     ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
  86     /// Calculate exponent
  87     const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
  88     return ((uint8_t*)&exps)[0];
  89   }
  90
  91
  92
  93   /// Apply compression to one compression block
  94   template<BlockFloatCompander::PackFunction networkBytePack>
  95   inline void
  96   applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
  97                         const int iqWidth, const uint8_t thisExp, const uint16_t rbWriteMask)
  98   {
  99     /// Store exponent first
 100     *outBlockAddr = thisExp;
 101     /// Apply the exponent shift
 102     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 103     /// Pack compressed data network byte order
 104     const auto compDataBytePacked = networkBytePack(compData);
 105     /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
 106     /// Use four offset stores to join
 107     const auto thisOutRegAddr = outBlockAddr + 1;
 108     _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 109     _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 110     _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 111     _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
 112   }
 113
 114   /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
 115   template<BlockFloatCompander::PackFunction networkBytePack>
 116   inline void
 117   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 118                  const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 119   {
 120     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 121     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 122 #pragma unroll(16)
 123     for (int n = 0; n < 16; ++n)
 124     {
 125       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 126     }
 127   }
 128
 129   /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
 130   template<BlockFloatCompander::PackFunction networkBytePack>
 131   inline void
 132   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 133                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 134   {
 135     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 136     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 137 #pragma unroll(4)
 138     for (int n = 0; n < 4; ++n)
 139     {
 140       applyCompressionN_1RB<networkBytePack>(dataInAddr + n, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], rbWriteMask);
 141     }
 142   }
 143
 144   /// Derive and apply 9, 10, or 12bit compression to 1 RB
 145   template<BlockFloatCompander::PackFunction networkBytePack>
 146   inline void
 147   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 148                 const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 149   {
 150     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 151     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 152     applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, rbWriteMask);
 153   }
 154
 155   /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 156   template<BlockFloatCompander::PackFunction networkBytePack>
 157   inline void
 158   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 159                    const __m512i totShiftBits, const int totNumBytesPerBlock, const uint16_t rbWriteMask)
 160   {
 161     switch (dataIn.numBlocks)
 162     {
 163     case 16:
 164       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 165       break;
 166
 167     case 4:
 168       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 169       break;
 170
 171     case 1:
 172       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, rbWriteMask);
 173       break;
 174     }
 175   }
 176
 177
 178
 179   /// Apply 8b compression to 1 compression block.
 180   inline void
 181   applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
 182   {
 183     /// Store exponent first
 184     *outBlockAddr = thisExp;
 185     /// Apply the exponent shift
 186     const auto compData = _mm512_srai_epi16(*dataIn, thisExp);
 187     /// Truncate to 8bit and store
 188     constexpr uint32_t k_writeMask = 0xFFFFFFFF;
 189     _mm256_mask_storeu_epi8(outBlockAddr + 1, k_writeMask, _mm512_cvtepi16_epi8(compData));
 190   }
 191
 192   /// Derive and apply 8b compression to 16 compression blocks
 193   inline void
 194   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 195   {
 196     const __m512i exponents = computeExponent_16RB(dataIn, totShiftBits);
 197     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 198 #pragma unroll(16)
 199     for (int n = 0; n < 16; ++n)
 200     {
 201       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 202     }
 203   }
 204
 205   /// Derive and apply 8b compression to 4 compression blocks
 206   inline void
 207   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 208   {
 209     const __m512i exponents = computeExponent_4RB(dataIn, totShiftBits);
 210     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 211 #pragma unroll(4)
 212     for (int n = 0; n < 4; ++n)
 213     {
 214       applyCompression8_1RB(dataInAddr + n, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
 215     }
 216   }
 217
 218   /// Derive and apply 8b compression to 1 compression block
 219   inline void
 220   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 221   {
 222     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 223     const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 224     applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
 225   }
 226
 227   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 228   inline void
 229   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 230   {
 231     switch (dataIn.numBlocks)
 232     {
 233     case 16:
 234       compress8_16RB(dataIn, dataOut, totShiftBits);
 235       break;
 236
 237     case 4:
 238       compress8_4RB(dataIn, dataOut, totShiftBits);
 239       break;
 240
 241     case 1:
 242       compress8_1RB(dataIn, dataOut, totShiftBits);
 243       break;
 244     }
 245   }
 246
 247
 248
 249   /// Expand 1 compression block
 250   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 251   inline void
 252   applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift)
 253   {
 254     const auto thisExpShift = maxExpShift - *expAddr;
 255     /// Unpack network order packed data
 256     const auto inDataUnpacked = networkByteUnpack(expAddr + 1);
 257     /// Apply exponent scaling (by appropriate arithmetic shift right)
 258     const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
 259     /// Write expanded data to output
 260     static constexpr uint8_t k_WriteMask = 0xFF;
 261     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expandedData);
 262   }
 263
 264   /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
 265   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 266   void
 267   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 268                  const int totNumBytesPerBlock, const int maxExpShift)
 269   {
 270     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 271     switch (dataIn.numBlocks)
 272     {
 273     case 16:
 274 #pragma unroll(16)
 275       for (int n = 0; n < 16; ++n)
 276       {
 277         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 278       }
 279       break;
 280
 281     case 4:
 282 #pragma unroll(4)
 283       for (int n = 0; n < 4; ++n)
 284       {
 285         applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n, maxExpShift);
 286       }
 287       break;
 288
 289     case 1:
 290       applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift);
 291       break;
 292     }
 293   }
 294
 295
 296   /// Apply expansion to 1 compression block
 297   inline void
 298   applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
 299   {
 300     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
 301     const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
 302     const auto expData = _mm512_slli_epi16(compData16, *expAddr);
 303     static constexpr uint8_t k_WriteMask = 0xFF;
 304     _mm512_mask_storeu_epi64(dataOutAddr, k_WriteMask, expData);
 305   }
 306
 307   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 308   void
 309   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 310   {
 311     __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
 312     switch (dataIn.numBlocks)
 313     {
 314     case 16:
 315 #pragma unroll(16)
 316       for (int n = 0; n < 16; ++n)
 317       {
 318         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 319       }
 320       break;
 321
 322     case 4:
 323 #pragma unroll(4)
 324       for (int n = 0; n < 4; ++n)
 325       {
 326         applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n);
 327       }
 328       break;
 329
 330     case 1:
 331       applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
 332       break;
 333     }
 334   }
 335 }
 336
 337
 338 /// Main kernel function for 16 antenna C-plane compression.
 339 /// Starts by determining iqWidth specific parameters and functions.
 340 void
 341 BlockFloatCompander::BFPCompressCtrlPlane16Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
 342 {
 343   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 344   const auto totShiftBits8 = _mm512_set1_epi32(25);
 345   const auto totShiftBits9 = _mm512_set1_epi32(24);
 346   const auto totShiftBits10 = _mm512_set1_epi32(23);
 347   const auto totShiftBits12 = _mm512_set1_epi32(21);
 348
 349   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 350   const auto totNumBytesPerBlock = ((BFP_CPlane_16::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 351
 352   /// Compressed data write mask for each iqWidth option
 353   constexpr uint16_t rbWriteMask9 = 0x01FF;
 354   constexpr uint16_t rbWriteMask10 = 0x03FF;
 355   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 356
 357   switch (dataIn.iqWidth)
 358   {
 359   case 8:
 360     BFP_CPlane_16::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 361     break;
 362
 363   case 9:
 364     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, rbWriteMask9);
 365     break;
 366
 367   case 10:
 368     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, rbWriteMask10);
 369     break;
 370
 371   case 12:
 372     BFP_CPlane_16::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, rbWriteMask12);
 373     break;
 374   }
 375 }
 376
 377
 378 /// Main kernel function for 16 antenna C-plane  expansion.
 379 /// Starts by determining iqWidth specific parameters and functions.
 380 void
 381 BlockFloatCompander::BFPExpandCtrlPlane16Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
 382 {
 383   constexpr int k_maxExpShift9 = 7;
 384   constexpr int k_maxExpShift10 = 6;
 385   constexpr int k_maxExpShift12 = 4;
 386
 387   /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
 388   const auto totNumBytesPerBlock = ((BFP_CPlane_16::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
 389
 390   switch (dataIn.iqWidth)
 391   {
 392   case 8:
 393     BFP_CPlane_16::expandByAlloc8(dataIn, dataOut);
 394     break;
 395
 396   case 9:
 397     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift9);
 398     break;
 399
 400   case 10:
 401     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift10);
 402     break;
 403
 404   case 12:
 405     BFP_CPlane_16::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, k_maxExpShift12);
 406     break;
 407   }
 408 }