fhi_lib/lib/src/xran_compression.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression U-plane implementation and interface functions
  21  *
  22  * @file xran_compression.cpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #include "xran_compression.hpp"
  28 #include "xran_bfp_utils.hpp"
  29 #include "xran_compression.h"
  30 #include <complex>
  31 #include <algorithm>
  32 #include <immintrin.h>
  33 #include <limits.h>
  34 #include <cstring>
  35
  36 namespace BFP_UPlane
  37 {
  /// Namespace constants
  /// Number of real-valued 16-bit samples in one RB: 12 IQ pairs, I and Q counted separately
  constexpr int k_numREReal = 24;
  40
  41   /// Perform horizontal max of 16 bit values across each lane
  42   __m512i
  43   horizontalMax4x16(const __m512i maxAbsIn)
  44   {
  45     /// Swap 64b in each lane and compute max
  46     const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
  47     auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbsIn);
  48     auto maxAbsHorz = _mm512_max_epi16(maxAbsIn, maxAbsPerm);
  49
  50     /// Swap each pair of 32b in each lane and compute max
  51     const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  52     maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
  53     maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  54
  55     /// Swap each IQ pair in each lane (via 32b rotation) and compute max
  56     maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
  57     return _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  58   }
  59
  60
  /// Regroup the samples of 4 consecutive RB (A, B, C, D) so that each RB owns
  /// one 128-bit lane position, then take the element-wise maximum of the
  /// absolute 16-bit values across the three regrouped registers.
  ///
  /// One RB occupies three 128-bit lanes, so the inputs hold (lane 0 first):
  ///   inA = [A0 A1 A2 B0]
  ///   inB = [B1 B2 C0 C1]
  ///   inC = [C2 D0 D1 D2]
  /// After the masked lane shuffles the working registers are:
  ///   w0  = [A0 B1 C2 D0]
  ///   w1  = [A1 B2 C0 D1]
  ///   w2  = [A2 B0 C1 D2]
  /// i.e. lane k of every working register contains only samples of RB k.
  ///
  /// In _mm512_mask_shuffle_i64x2 the mask has one bit per 64-bit element (two
  /// per lane): a set bit takes the shuffled value, a clear bit keeps the first
  /// (pass-through) operand.
  ///
  /// @return max(|w0|, |w1|, |w2|) per 16-bit element
  __m512i
  maxAbsVertical4RB(const __m512i inA, const __m512i inB, const __m512i inC)
  {
    /// w0: keep lane 0 of inA, fill lanes 1..3 from inB/inC
    constexpr __mmask8 k_fillLanes123 = 0xFC;
    constexpr int k_select0 = 0x41;
    const __m512i w0 = _mm512_mask_shuffle_i64x2(inA, k_fillLanes123, inB, inC, k_select0);

    /// w1: keep the two middle lanes of inB, fill lanes 0 and 3 from inA/inC
    constexpr __mmask8 k_fillLanes03 = 0xC3;
    constexpr int k_select1 = 0xB1;
    const __m512i w1 = _mm512_mask_shuffle_i64x2(inB, k_fillLanes03, inA, inC, k_select1);

    /// w2: keep lane 3 of inC, fill lanes 0..2 from inA/inB
    constexpr __mmask8 k_fillLanes012 = 0x3F;
    constexpr int k_select2 = 0xBE;
    const __m512i w2 = _mm512_mask_shuffle_i64x2(inC, k_fillLanes012, inA, inB, k_select2);

    /// Vertical max of the absolute values
    const __m512i maxAbs01 = _mm512_max_epi16(_mm512_abs_epi16(w0), _mm512_abs_epi16(w1));
    return _mm512_max_epi16(maxAbs01, _mm512_abs_epi16(w2));
  }
  93
  94
  /// Copy the first 32-bit element of each of the four lanes of src into the
  /// four 32-bit elements of lane laneNum of dest; every other lane of dest is
  /// returned unchanged.
  ///
  /// @param src      Register whose elements 0, 4, 8 and 12 are gathered
  /// @param dest     Register receiving the gathered values
  /// @param laneNum  Destination lane, must be in the range 0..3
  /// @return dest with lane laneNum replaced
  __m512i
  slidePermute(const __m512i src, const __m512i dest, const int laneNum)
  {
    /// Indices 16..31 address the second table operand (src) of the two-source
    /// permute, so 16/20/24/28 are the lane heads of src
    const __m512i k_laneHeads = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
                                                 28, 24, 20, 16, 28, 24, 20, 16);
    /// Four consecutive mask bits select the 32-bit elements of the target lane
    const __mmask16 laneMask = static_cast<__mmask16>(0x000F << (4 * laneNum));
    return _mm512_mask_permutex2var_epi32(dest, laneMask, k_laneHeads, src);
  }
 104
 105
 106   /// Compute exponent value for a set of 16 RB from the maximum absolute value.
 107   /// Max Abs operates in a loop, executing 4 RB per iteration. The results are
 108   /// packed into the final output register.
 109   __m512i
 110   computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
 111   {
 112     __m512i maxAbs = __m512i();
 113     const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 114     /// Max Abs loop operates on 4RB at a time
 115 #pragma unroll(4)
 116     for (int n = 0; n < 4; ++n)
 117     {
 118       /// Re-order and vertical max abs
 119       auto maxAbsVert = maxAbsVertical4RB(rawData[3 * n + 0], rawData[3 * n + 1], rawData[3 * n + 2]);
 120       /// Horizontal max abs
 121       auto maxAbsHorz = horizontalMax4x16(maxAbsVert);
 122       /// Pack these 4 values into maxAbs
 123       maxAbs = slidePermute(maxAbsHorz, maxAbs, n);
 124     }
 125     /// Calculate exponent
 126     const auto maxAbs32 = BlockFloatCompander::maskUpperWord(maxAbs);
 127     return BlockFloatCompander::expLzCnt(maxAbs32, totShiftBits);
 128   }
 129
 130
 131   /// Compute exponent value for a set of 4 RB from the maximum absolute value.
 132   /// Note that we do not need to perform any packing of result as we are only
 133   /// computing 4 RB. The appropriate offset is taken later when extracting the
 134   /// exponent.
 135   __m512i
 136   computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
 137   {
 138     const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 139     /// Re-order and vertical max abs
 140     const auto maxAbsVert = maxAbsVertical4RB(rawData[0], rawData[1], rawData[2]);
 141     /// Horizontal max abs
 142     const auto maxAbsHorz = horizontalMax4x16(maxAbsVert);
 143     /// Calculate exponent
 144     const auto maxAbs = BlockFloatCompander::maskUpperWord(maxAbsHorz);
 145     return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
 146   }
 147
 148
 149   /// Compute exponent value for 1 RB from the maximum absolute value.
 150   /// This works with horizontal max abs only, and needs to include a
 151   /// step to select the final exponent from the 4 lanes.
 152   uint8_t
 153   computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
 154   {
 155     const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
 156     /// Abs
 157     const auto rawDataAbs = _mm512_abs_epi16(rawData[0]);
 158     /// No need to do a full horizontal max operation here, just do a max IQ step,
 159     /// compute the exponents and then use a reduce max over all exponent values. This
 160     /// is the fastest way to handle a single RB.
 161     const auto rawAbsIQSwap = _mm512_rol_epi32(rawDataAbs, BlockFloatCompander::k_numBitsIQ);
 162     const auto maxAbsIQ = _mm512_max_epi16(rawDataAbs, rawAbsIQSwap);
 163     /// Calculate exponent
 164     const auto maxAbsIQ32 = BlockFloatCompander::maskUpperWord(maxAbsIQ);
 165     const auto exps = BlockFloatCompander::expLzCnt(maxAbsIQ32, totShiftBits);
 166     /// At this point we have exponent values for the maximum of each IQ pair.
 167     /// Run a reduce max step to compute the maximum exponent value in the first
 168     /// three lanes - this will give the desired exponent for this RB.
 169     constexpr uint16_t k_expMsk = 0x0FFF;
 170     return (uint8_t)_mm512_mask_reduce_max_epi32(k_expMsk, exps);
 171   }
 172
 173
 174   /// Apply compression to 1 RB
 175   template<BlockFloatCompander::PackFunction networkBytePack>
 176   void
 177   applyCompressionN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 178                         const int numREOffset, const uint8_t thisExp, const int thisRBExpAddr, const uint16_t rbWriteMask)
 179   {
 180     /// Get AVX512 pointer aligned to desired RB
 181     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + numREOffset);
 182     /// Apply the exponent shift
 183     const auto compData = _mm512_srai_epi16(*rawDataIn, thisExp);
 184     /// Pack compressed data network byte order
 185     const auto compDataBytePacked = networkBytePack(compData);
 186     /// Store exponent first
 187     dataOut->dataCompressed[thisRBExpAddr] = thisExp;
 188     /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
 189     /// Use three offset stores to join
 190     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
 191     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + dataIn.iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
 192     _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * dataIn.iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
 193   }
 194
 195
 196   /// Apply 9, 10, or 12bit compression to 16 RB
 197   template<BlockFloatCompander::PackFunction networkBytePack>
 198   void
 199   compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 200                  const __m512i totShiftBits, const int totNumBytesPerRB, const uint16_t rbWriteMask)
 201   {
 202     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 203 #pragma unroll(16)
 204     for (int n = 0; n < 16; ++n)
 205     {
 206       applyCompressionN_1RB<networkBytePack>(dataIn, dataOut, n * k_numREReal, ((uint8_t*)&exponents)[n * 4], n * totNumBytesPerRB, rbWriteMask);
 207     }
 208   }
 209
 210
 211   /// Apply 9, 10, or 12bit compression to 4 RB
 212   template<BlockFloatCompander::PackFunction networkBytePack>
 213   void
 214   compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 215                 const __m512i totShiftBits, const int totNumBytesPerRB, const uint16_t rbWriteMask)
 216   {
 217     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 218 #pragma unroll(4)
 219     for (int n = 0; n < 4; ++n)
 220     {
 221       applyCompressionN_1RB<networkBytePack>(dataIn, dataOut, n * k_numREReal, ((uint8_t*)&exponents)[n * 16], n * totNumBytesPerRB, rbWriteMask);
 222     }
 223   }
 224
 225
 226   /// Apply 9, 10, or 12bit compression to 1 RB
 227   template<BlockFloatCompander::PackFunction networkBytePack>
 228   void
 229   compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 230                 const __m512i totShiftBits, const int totNumBytesPerRB, const uint16_t rbWriteMask)
 231   {
 232     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 233     applyCompressionN_1RB<networkBytePack>(dataIn, dataOut, 0, thisExponent, 0, rbWriteMask);
 234   }
 235
 236
 237   /// Calls compression function specific to the number of RB to be executed. For 9, 10, or 12bit iqWidth.
 238   template<BlockFloatCompander::PackFunction networkBytePack>
 239   void
 240   compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 241                    const __m512i totShiftBits, const int totNumBytesPerRB, const uint16_t rbWriteMask)
 242   {
 243     switch (dataIn.numBlocks)
 244     {
 245     case 16:
 246       compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerRB, rbWriteMask);
 247       break;
 248
 249     case 4:
 250       compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerRB, rbWriteMask);
 251       break;
 252
 253     case 1:
 254       compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerRB, rbWriteMask);
 255       break;
 256     }
 257   }
 258
 259
 260   /// Apply compression to 1 RB
 261   void
 262   applyCompression8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
 263                         const int numREOffset, const uint8_t thisExp, const int thisRBExpAddr)
 264   {
 265     /// Get AVX512 pointer aligned to desired RB
 266     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + numREOffset);
 267     /// Apply the exponent shift
 268     const auto compData = _mm512_srai_epi16(*rawDataIn, thisExp);
 269     /// Store exponent first
 270     dataOut->dataCompressed[thisRBExpAddr] = thisExp;
 271     /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
 272     /// Use three offset stores to join
 273     constexpr uint32_t k_rbMask = 0x00FFFFFF; // Write mask for 1RB (24 values)
 274     _mm256_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_rbMask, _mm512_cvtepi16_epi8(compData));
 275   }
 276
 277
 278   /// 8bit RB compression loop for 16 RB
 279   void
 280   compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 281   {
 282     const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
 283 #pragma unroll(16)
 284     for (int n = 0; n < 16; ++n)
 285     {
 286       applyCompression8_1RB(dataIn, dataOut, n * k_numREReal, ((uint8_t*)&exponents)[n * 4], n * (k_numREReal + 1));
 287     }
 288   }
 289
 290
 291   /// 8bit RB compression loop for 4 RB
 292   void
 293   compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 294   {
 295     const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
 296 #pragma unroll(4)
 297     for (int n = 0; n < 4; ++n)
 298     {
 299       applyCompression8_1RB(dataIn, dataOut, n * k_numREReal, ((uint8_t*)&exponents)[n * 16], n * (k_numREReal + 1));
 300     }
 301   }
 302
 303
 304   /// 8bit RB compression loop for 4 RB
 305   void
 306   compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 307   {
 308     const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
 309     applyCompression8_1RB(dataIn, dataOut, 0, thisExponent, 0);
 310   }
 311
 312
 313   /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
 314   void
 315   compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
 316   {
 317     switch (dataIn.numBlocks)
 318     {
 319     case 16:
 320       compress8_16RB(dataIn, dataOut, totShiftBits);
 321       break;
 322
 323     case 4:
 324       compress8_4RB(dataIn, dataOut, totShiftBits);
 325       break;
 326
 327     case 1:
 328       compress8_1RB(dataIn, dataOut, totShiftBits);
 329       break;
 330     }
 331   }
 332
 333
 334   /// Apply compression to 1 RB
 335   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 336   void
 337   applyExpansionN_1RB(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 338                       const int expAddr, const int thisRBAddr, const int maxExpShift)
 339   {
 340     /// Unpack network order packed data
 341     const auto dataUnpacked = networkByteUnpack(dataIn.dataCompressed + expAddr + 1);
 342     /// Apply exponent scaling (by appropriate arithmetic shift right)
 343     const auto dataExpanded = _mm512_srai_epi16(dataUnpacked, maxExpShift - *(dataIn.dataCompressed + expAddr));
 344     /// Write expanded data to output
 345     static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
 346     _mm512_mask_storeu_epi16(dataOut->dataExpanded + thisRBAddr, k_WriteMask, dataExpanded);
 347   }
 348
 349
 350   /// Calls compression function specific to the number of RB to be executed. For 9, 10, or 12bit iqWidth.
 351   template<BlockFloatCompander::UnpackFunction networkByteUnpack>
 352   void
 353   expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 354                  const int totNumBytesPerRB, const int maxExpShift)
 355   {
 356     switch (dataIn.numBlocks)
 357     {
 358     case 16:
 359 #pragma unroll(16)
 360       for (int n = 0; n < 16; ++n)
 361       {
 362         applyExpansionN_1RB<networkByteUnpack>(dataIn, dataOut, n * totNumBytesPerRB, n * k_numREReal, maxExpShift);
 363       }
 364       break;
 365
 366     case 4:
 367 #pragma unroll(4)
 368       for (int n = 0; n < 4; ++n)
 369       {
 370         applyExpansionN_1RB<networkByteUnpack>(dataIn, dataOut, n * totNumBytesPerRB, n * k_numREReal, maxExpShift);
 371       }
 372       break;
 373
 374     case 1:
 375       applyExpansionN_1RB<networkByteUnpack>(dataIn, dataOut, 0, 0, maxExpShift);
 376       break;
 377     }
 378   }
 379
 380
 381   /// Apply expansion to 1 RB and store
 382   void
 383   applyExpansion8_1RB(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
 384                       const int expAddr, const int thisRBAddr)
 385   {
 386     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(dataIn.dataCompressed + expAddr + 1);
 387     const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
 388     const auto expData = _mm512_slli_epi16(compData16, *(dataIn.dataCompressed + expAddr));
 389     constexpr uint8_t k_rbMask64 = 0b00111111; // 64b write mask for 1RB (24 int16 values)
 390     _mm512_mask_storeu_epi64(dataOut->dataExpanded + thisRBAddr, k_rbMask64, expData);
 391   }
 392
 393
 394   /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
 395   void
 396   expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
 397   {
 398     switch (dataIn.numBlocks)
 399     {
 400     case 16:
 401 #pragma unroll(16)
 402       for (int n = 0; n < 16; ++n)
 403       {
 404         applyExpansion8_1RB(dataIn, dataOut, n * (k_numREReal + 1), n * k_numREReal);
 405       }
 406       break;
 407
 408     case 4:
 409 #pragma unroll(4)
 410       for (int n = 0; n < 4; ++n)
 411       {
 412         applyExpansion8_1RB(dataIn, dataOut, n * (k_numREReal + 1), n * k_numREReal);
 413       }
 414       break;
 415
 416     case 1:
 417       applyExpansion8_1RB(dataIn, dataOut, 0, 0);
 418       break;
 419     }
 420   }
 421 }
 422
 423
 424
 425 /// Main kernel function for compression.
 426 /// Starts by determining iqWidth specific parameters and functions.
 427 void
 428 BlockFloatCompander::BFPCompressUserPlaneAvx512(const ExpandedData& dataIn, CompressedData* dataOut)
 429 {
 430   /// Compensation for extra zeros in 32b leading zero count when computing exponent
 431   const auto totShiftBits8 = _mm512_set1_epi32(25);
 432   const auto totShiftBits9 = _mm512_set1_epi32(24);
 433   const auto totShiftBits10 = _mm512_set1_epi32(23);
 434   const auto totShiftBits12 = _mm512_set1_epi32(21);
 435
 436   /// Total number of compressed bytes per RB for each iqWidth option
 437   constexpr int totNumBytesPerRB9 = 28;
 438   constexpr int totNumBytesPerRB10 = 31;
 439   constexpr int totNumBytesPerRB12 = 37;
 440
 441   /// Compressed data write mask for each iqWidth option
 442   constexpr uint16_t rbWriteMask9 = 0x01FF;
 443   constexpr uint16_t rbWriteMask10 = 0x03FF;
 444   constexpr uint16_t rbWriteMask12 = 0x0FFF;
 445
 446   switch (dataIn.iqWidth)
 447   {
 448   case 8:
 449     BFP_UPlane::compressByAlloc8(dataIn, dataOut, totShiftBits8);
 450     break;
 451
 452   case 9:
 453     BFP_UPlane::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerRB9, rbWriteMask9);
 454     break;
 455
 456   case 10:
 457     BFP_UPlane::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerRB10, rbWriteMask10);
 458     break;
 459
 460   case 12:
 461     BFP_UPlane::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerRB12, rbWriteMask12);
 462     break;
 463   }
 464 }
 465
 466
 467
 468 /// Main kernel function for expansion.
 469 /// Starts by determining iqWidth specific parameters and functions.
 470 void
 471 BlockFloatCompander::BFPExpandUserPlaneAvx512(const CompressedData& dataIn, ExpandedData* dataOut)
 472 {
 473   constexpr int k_totNumBytesPerRB9 = 28;
 474   constexpr int k_totNumBytesPerRB10 = 31;
 475   constexpr int k_totNumBytesPerRB12 = 37;
 476
 477   constexpr int k_maxExpShift9 = 7;
 478   constexpr int k_maxExpShift10 = 6;
 479   constexpr int k_maxExpShift12 = 4;
 480
 481   switch (dataIn.iqWidth)
 482   {
 483   case 8:
 484     BFP_UPlane::expandByAlloc8(dataIn, dataOut);
 485     break;
 486
 487   case 9:
 488     BFP_UPlane::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, k_totNumBytesPerRB9, k_maxExpShift9);
 489     break;
 490
 491   case 10:
 492     BFP_UPlane::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, k_totNumBytesPerRB10, k_maxExpShift10);
 493     break;
 494
 495   case 12:
 496     BFP_UPlane::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, k_totNumBytesPerRB12, k_maxExpShift12);
 497     break;
 498   }
 499 }
 500
 501 /** callback function type for Symbol packet */
 502 typedef void (*xran_bfp_compress_fn)(const BlockFloatCompander::ExpandedData& dataIn,
 503                                      BlockFloatCompander::CompressedData* dataOut);
 504
 505 /** callback function type for Symbol packet */
 506 typedef void (*xran_bfp_decompress_fn)(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut);
 507
 508 int32_t
 509 xranlib_compress_avx512(const struct xranlib_compress_request *request,
 510                         struct xranlib_compress_response *response)
 511 {
 512     BlockFloatCompander::ExpandedData expandedDataInput;
 513     BlockFloatCompander::CompressedData compressedDataOut;
 514     xran_bfp_compress_fn com_fn = NULL;
 515     uint16_t totalRBs = request->numRBs;
 516     uint16_t remRBs   = totalRBs;
 517     int16_t len = 0;
 518     int16_t block_idx_bytes = 0;
 519
 520     switch (request->iqWidth) {
 521         case 8:
 522         case 9:
 523         case 10:
 524         case 12:
 525             com_fn = BlockFloatCompander::BFPCompressUserPlaneAvx512;
 526             break;
 527         default:
 528             com_fn = BlockFloatCompander::BFPCompressRef;
 529             break;
 530     }
 531
 532     expandedDataInput.iqWidth = request->iqWidth;
 533     expandedDataInput.numDataElements =  24;
 534
 535     while (remRBs){
 536         expandedDataInput.dataExpanded   = &request->data_in[block_idx_bytes];
 537         compressedDataOut.dataCompressed = (uint8_t*)&response->data_out[len];
 538         if(remRBs >= 16){
 539             expandedDataInput.numBlocks = 16;
 540             com_fn(expandedDataInput, &compressedDataOut);
 541             len  += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)16);
 542             block_idx_bytes += 16*expandedDataInput.numDataElements;
 543             remRBs -= 16;
 544         }else if(remRBs >= 4){
 545             expandedDataInput.numBlocks = 4;
 546             com_fn(expandedDataInput, &compressedDataOut);
 547             len  += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)4);
 548             block_idx_bytes +=4*expandedDataInput.numDataElements;
 549             remRBs -=4;
 550         }else if (remRBs >= 1){
 551             expandedDataInput.numBlocks = 1;
 552             com_fn(expandedDataInput, &compressedDataOut);
 553             len  += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)1);
 554             block_idx_bytes +=1*expandedDataInput.numDataElements;
 555             remRBs = remRBs - 1;
 556         }
 557     }
 558
 559     response->len =  ((3 * expandedDataInput.iqWidth) + 1) * totalRBs;
 560
 561     return 0;
 562 }
 563
 564 int32_t
 565 xranlib_decompress_avx512(const struct xranlib_decompress_request *request,
 566     struct xranlib_decompress_response *response)
 567 {
 568     BlockFloatCompander::CompressedData compressedDataInput;
 569     BlockFloatCompander::ExpandedData expandedDataOut;
 570
 571     xran_bfp_decompress_fn decom_fn = NULL;
 572     uint16_t totalRBs = request->numRBs;
 573     uint16_t remRBs   = totalRBs;
 574     int16_t len = 0;
 575     int16_t block_idx_bytes = 0;
 576
 577     switch (request->iqWidth) {
 578     case 8:
 579     case 9:
 580     case 10:
 581     case 12:
 582         decom_fn = BlockFloatCompander::BFPExpandUserPlaneAvx512;
 583         break;
 584     default:
 585         decom_fn = BlockFloatCompander::BFPExpandRef;
 586         break;
 587     }
 588
 589     compressedDataInput.iqWidth         =  request->iqWidth;
 590     compressedDataInput.numDataElements =  24;
 591
 592     while(remRBs) {
 593         compressedDataInput.dataCompressed = (uint8_t*)&request->data_in[block_idx_bytes];
 594         expandedDataOut.dataExpanded       = &response->data_out[len];
 595         if(remRBs >= 16){
 596             compressedDataInput.numBlocks = 16;
 597             decom_fn(compressedDataInput, &expandedDataOut);
 598             len  += 16*compressedDataInput.numDataElements;
 599             block_idx_bytes  += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)16);
 600             remRBs -= 16;
 601         }else if(remRBs >= 4){
 602             compressedDataInput.numBlocks = 4;
 603             decom_fn(compressedDataInput, &expandedDataOut);
 604             len  += 4*compressedDataInput.numDataElements;
 605             block_idx_bytes  += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)4);
 606             remRBs -=4;
 607         }else if (remRBs >= 1){
 608             compressedDataInput.numBlocks = 1;
 609             decom_fn(compressedDataInput, &expandedDataOut);
 610             len  += 1*compressedDataInput.numDataElements;
 611             block_idx_bytes  += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)1);
 612             remRBs = remRBs - 1;
 613         }
 614     }
 615
 616     response->len = totalRBs * compressedDataInput.numDataElements * sizeof(int16_t);
 617
 618     return 0;
 619 }
 620
 621 int32_t
 622 xranlib_compress_avx512_bfw(const struct xranlib_compress_request *request,
 623                         struct xranlib_compress_response *response)
 624 {
 625     BlockFloatCompander::ExpandedData expandedDataInput;
 626     BlockFloatCompander::CompressedData compressedDataOut;
 627     xran_bfp_compress_fn com_fn = NULL;
 628
 629     if (request->numRBs != 1){
 630         printf("Unsupported numRBs %d\n", request->numRBs);
 631         return -1;
 632     }
 633
 634     switch (request->iqWidth) {
 635         case 8:
 636         case 9:
 637         case 10:
 638         case 12:
 639         switch (request->numDataElements) {
 640             case 16:
 641                 com_fn = BlockFloatCompander::BFPCompressCtrlPlane8Avx512;
 642                 break;
 643             case 32:
 644                 com_fn = BlockFloatCompander::BFPCompressCtrlPlane16Avx512;
 645                 break;
 646             case 64:
 647                 com_fn = BlockFloatCompander::BFPCompressCtrlPlane32Avx512;
 648                 break;
 649             case 128:
 650                 com_fn = BlockFloatCompander::BFPCompressCtrlPlane64Avx512;
 651                 break;
 652             case 24:
 653             default:
 654                 printf("Unsupported numDataElements %d\n", request->numDataElements);
 655                 return -1;
 656                 break;
 657         }
 658         break;
 659     default:
 660         printf("Unsupported iqWidth %d\n", request->iqWidth);
 661         return -1;
 662         break;
 663     }
 664
 665     expandedDataInput.iqWidth         = request->iqWidth;
 666     expandedDataInput.numDataElements = request->numDataElements;
 667     expandedDataInput.numBlocks       = 1;
 668     expandedDataInput.dataExpanded    = &request->data_in[0];
 669     compressedDataOut.dataCompressed  = (uint8_t*)&response->data_out[0];
 670
 671     com_fn(expandedDataInput, &compressedDataOut);
 672
 673     response->len =  (((expandedDataInput.numDataElements  * expandedDataInput.iqWidth) >> 3) + 1)
 674                             * request->numRBs;
 675
 676     return 0;
 677 }
 678
 679 int32_t
 680 xranlib_decompress_avx512_bfw(const struct xranlib_decompress_request *request,
 681                         struct xranlib_decompress_response *response)
 682 {
 683     BlockFloatCompander::CompressedData compressedDataInput;
 684     BlockFloatCompander::ExpandedData expandedDataOut;
 685     xran_bfp_decompress_fn decom_fn = NULL;
 686
 687     if (request->numRBs != 1){
 688         printf("Unsupported numRBs %d\n", request->numRBs);
 689         return -1;
 690     }
 691
 692     switch (request->iqWidth) {
 693         case 8:
 694         case 9:
 695         case 10:
 696         case 12:
 697         switch (request->numDataElements) {
 698             case 16:
 699                 decom_fn = BlockFloatCompander::BFPExpandCtrlPlane8Avx512;
 700                 break;
 701             case 32:
 702                 decom_fn = BlockFloatCompander::BFPExpandCtrlPlane16Avx512;
 703                 break;
 704             case 64:
 705                 decom_fn = BlockFloatCompander::BFPExpandCtrlPlane32Avx512;
 706                 break;
 707             case 128:
 708                 decom_fn = BlockFloatCompander::BFPExpandCtrlPlane64Avx512;
 709                 break;
 710             case 24:
 711             default:
 712                 printf("Unsupported numDataElements %d\n", request->numDataElements);
 713                 return -1;
 714                 break;
 715         }
 716         break;
 717     default:
 718         printf("Unsupported iqWidth %d\n", request->iqWidth);
 719         return -1;
 720         break;
 721     }
 722
 723     compressedDataInput.iqWidth         = request->iqWidth;
 724     compressedDataInput.numDataElements = request->numDataElements;
 725     compressedDataInput.numBlocks       = 1;
 726     compressedDataInput.dataCompressed  = (uint8_t*)&request->data_in[0];
 727     expandedDataOut.dataExpanded        = &response->data_out[0];
 728
 729     decom_fn(compressedDataInput, &expandedDataOut);
 730
 731     response->len = request->numRBs * compressedDataInput.numDataElements * sizeof(int16_t);
 732
 733     return 0;
 734 }
 735