fhi_lib/lib/src/xran_compression.cpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 #include "xran_compression.hpp"
  20 #include <complex>
  21 #include <algorithm>
  22 #include <immintrin.h>
  23
  24 void
  25 BlockFloatCompander::BlockFloatCompress_AVX512(const ExpandedData& dataIn, CompressedData* dataOut)
  26 {
  27   __m512i maxAbs = __m512i();
  28
  29   /// Load data and find max(abs(RB))
  30   const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  31   static constexpr int k_numInputLoopIts = BlockFloatCompander::k_numRB / 4;
  32
  33 #pragma unroll(k_numInputLoopIts)
  34   for (int n = 0; n < k_numInputLoopIts; ++n)
  35   {
  36     /// Re-order the next 4RB in input data into 3 registers
  37     /// Input SIMD vectors are:
  38     ///   [A A A A A A A A A A A A B B B B]
  39     ///   [B B B B B B B B C C C C C C C C]
  40     ///   [C C C C D D D D D D D D D D D D]
  41     /// Re-ordered SIMD vectors are:
  42     ///   [A A A A B B B B C C C C D D D D]
  43     ///   [A A A A B B B B C C C C D D D D]
  44     ///   [A A A A B B B B C C C C D D D D]
  45     static constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src
  46     static constexpr int k_shuff1 = 0x41;
  47     const auto z_w1 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 0], k_msk1, rawData[3 * n + 1], rawData[3 * n + 2], k_shuff1);
  48
  49     static constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src
  50     static constexpr int k_shuff2 = 0xB1;
  51     const auto z_w2 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 1], k_msk2, rawData[3 * n + 0], rawData[3 * n + 2], k_shuff2);
  52
  53     static constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src
  54     static constexpr int k_shuff3 = 0xBE;
  55     const auto z_w3 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 2], k_msk3, rawData[3 * n + 0], rawData[3 * n + 1], k_shuff3);
  56
  57     /// Perform max abs on these 3 registers
  58     const auto abs16_1 = _mm512_abs_epi16(z_w1);
  59     const auto abs16_2 = _mm512_abs_epi16(z_w2);
  60     const auto abs16_3 = _mm512_abs_epi16(z_w3);
  61     const auto maxAbs_12 = _mm512_max_epi16(abs16_1, abs16_2);
  62     const auto maxAbs_123 = _mm512_max_epi16(maxAbs_12, abs16_3);
  63
  64     /// Perform horizontal max over each lane
  65     /// Swap 64b in each lane and compute max
  66     static const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
  67     auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbs_123);
  68     auto maxAbsHorz = _mm512_max_epi16(maxAbs_123, maxAbsPerm);
  69
  70     /// Swap each pair of 32b in each lane and compute max
  71     static const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  72     maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
  73     maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  74
  75     /// Swap each IQ pair in each lane (via 32b rotation) and compute max
  76     maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
  77     maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
  78
  79     /// Insert values into maxAbs
  80     /// Use sliding mask to insert wanted values into maxAbs
  81     /// Pairs of values will be inserted and corrected outside of loop
  82     static const auto k_select4RB = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
  83                                                      28, 24, 20, 16, 28, 24, 20, 16);
  84     static constexpr uint16_t k_expMsk[k_numInputLoopIts] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
  85     maxAbs = _mm512_mask_permutex2var_epi32(maxAbs, k_expMsk[n], k_select4RB, maxAbsHorz);
  86   }
  87
  88   /// Convert to 32b by removing repeated values in maxAbs
  89   static const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  90                                                        0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  91                                                        0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  92                                                        0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
  93   maxAbs = _mm512_and_epi64(maxAbs, k_upperWordMask);
  94
  95   /// Compute exponent and store for later use
  96   static constexpr int k_expTotShiftBits = 32 - BlockFloatCompander::k_iqWidth + 1;
  97   const auto totShiftBits = _mm512_set1_epi32(k_expTotShiftBits);
  98   const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
  99   const auto exponent = _mm512_sub_epi32(totShiftBits, lzCount);
 100   int8_t storedExp[BlockFloatCompander::k_numRB] = {};
 101   static constexpr uint16_t k_expWriteMask = 0xFFFF;
 102   _mm512_mask_cvtepi32_storeu_epi8(storedExp, k_expWriteMask, exponent);
 103
 104   /// Shift 1RB by corresponding exponent and write exponent and data to output
 105   /// Output data is packed exponent first followed by corresponding compressed RB
 106 #pragma unroll(BlockFloatCompander::k_numRB)
 107   for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
 108   {
 109     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
 110     auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
 111
 112     dataOut->dataCompressed[n * (BlockFloatCompander::k_numREReal + 1)] = storedExp[n];
 113     static constexpr uint32_t k_rbMask = 0x00FFFFFF; // Write mask for 1RB (24 values)
 114     _mm512_mask_cvtepi16_storeu_epi8(dataOut->dataCompressed + n * (BlockFloatCompander::k_numREReal + 1) + 1, k_rbMask, compData);
 115   }
 116 }
 117
 118
 119 void
 120 BlockFloatCompander::BlockFloatExpand_AVX512(const CompressedData& dataIn, ExpandedData* dataOut)
 121 {
 122 #pragma unroll(BlockFloatCompander::k_numRB)
 123   for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
 124   {
 125     /// Expand 1RB of data
 126     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(dataIn.dataCompressed + n * (BlockFloatCompander::k_numREReal + 1) + 1);
 127     const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
 128     const auto expData = _mm512_slli_epi16(compData16, *(dataIn.dataCompressed + n * (BlockFloatCompander::k_numREReal + 1)));
 129
 130     /// Write expanded data to output
 131     static constexpr uint8_t k_rbMask64 = 0b00111111; // 64b write mask for 1RB (24 int16 values)
 132     _mm512_mask_storeu_epi64(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_rbMask64, expData);
 133   }
 134 }
 135
 136
 137 void
 138 BlockFloatCompander::BlockFloatCompress_Basic(const ExpandedData& dataIn, CompressedData* dataOut)
 139 {
 140   int16_t maxAbs[BlockFloatCompander::k_numRB];
 141   for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
 142   {
 143     // Find max abs value for this RB
 144     maxAbs[rb] = 0;
 145     for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
 146     {
 147       auto dataIdx = rb * BlockFloatCompander::k_numREReal + re;
 148       int16_t dataAbs = (int16_t)std::abs(dataIn.dataExpanded[dataIdx]);
 149       maxAbs[rb] = std::max(maxAbs[rb], dataAbs);
 150     }
 151
 152     // Find exponent
 153     static constexpr int k_expTotShiftBits16 = 16 - BlockFloatCompander::k_iqWidth + 1;
 154     auto thisExp = (int8_t)(k_expTotShiftBits16 - __lzcnt16(maxAbs[rb]));
 155     auto expIdx = rb * (BlockFloatCompander::k_numREReal + 1);
 156     dataOut->dataCompressed[expIdx] = thisExp;
 157
 158     // ARS data by exponent
 159     for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
 160     {
 161       auto dataIdxIn = rb * BlockFloatCompander::k_numREReal + re;
 162       auto dataIdxOut = (expIdx + 1) + re;
 163       dataOut->dataCompressed[dataIdxOut] = (int8_t)(dataIn.dataExpanded[dataIdxIn] >> thisExp);
 164     }
 165   }
 166 }
 167
 168
 169 void
 170 BlockFloatCompander::BlockFloatExpand_Basic(const CompressedData& dataIn, ExpandedData* dataOut)
 171 {
 172   // Expand data
 173   for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
 174   {
 175     for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
 176     {
 177       auto dataIdxOut = rb * BlockFloatCompander::k_numREReal + re;
 178       auto expIdx = rb * (BlockFloatCompander::k_numREReal + 1);
 179       auto dataIdxIn = (expIdx + 1) + re;
 180       auto thisData = (int16_t)dataIn.dataCompressed[dataIdxIn];
 181       auto thisExp = (int16_t)dataIn.dataCompressed[expIdx];
 182       dataOut->dataExpanded[dataIdxOut] = (int16_t)(thisData << thisExp);
 183     }
 184   }
 185 }