fhi_lib/lib/src/xran_bfp_utils.hpp

   1 /******************************************************************************
   2 *
   3 *   Copyright (c) 2019 Intel.
   4 *
   5 *   Licensed under the Apache License, Version 2.0 (the "License");
   6 *   you may not use this file except in compliance with the License.
   7 *   You may obtain a copy of the License at
   8 *
   9 *       http://www.apache.org/licenses/LICENSE-2.0
  10 *
  11 *   Unless required by applicable law or agreed to in writing, software
  12 *   distributed under the License is distributed on an "AS IS" BASIS,
  13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 *   See the License for the specific language governing permissions and
  15 *   limitations under the License.
  16 *
  17 *******************************************************************************/
  18
  19 /**
  20  * @brief xRAN BFP compression/decompression utilities functions
  21  *
  22  * @file xran_bfp_utils.hpp
  23  * @ingroup group_source_xran
  24  * @author Intel Corporation
  25  **/
  26
  27 #pragma once
  28 #include <immintrin.h>
  29
  30 namespace BlockFloatCompander
  31 {
  32   /// Define function signatures for byte packing functions
  33   typedef __m512i(*PackFunction)(const __m512i);
  34   typedef __m512i(*UnpackFunction)(const uint8_t*);
  35   typedef __m256i(*UnpackFunction256)(const uint8_t*);
  36
  37   /// Calculate exponent based on 16 max abs values using leading zero count.
  38   inline __m512i
  39   maskUpperWord(const __m512i inData)
  40   {
  41     const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  42                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  43                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
  44                                                   0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
  45     return _mm512_and_epi64(inData, k_upperWordMask);
  46   }
  47
  48   /// Calculate exponent based on 16 max abs values using leading zero count.
  49   inline __m512i
  50   expLzCnt(const __m512i maxAbs, const __m512i totShiftBits)
  51   {
  52     /// Compute exponent
  53     const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
  54     return _mm512_subs_epu16(totShiftBits, lzCount);
  55   }
  56
  57   inline int
  58   horizontalMax1x32(const __m512i maxAbsReg)
  59   {
  60     /// Swap each IQ pair in each lane (via 32b rotation) and compute max of
  61     /// each pair.
  62     const auto maxRot16 = _mm512_rol_epi32(maxAbsReg, BlockFloatCompander::k_numBitsIQ);
  63     const auto maxAbsIQ = _mm512_max_epi16(maxAbsReg, maxRot16);
  64     /// Convert to 32b by removing repeated values in maxAbs
  65     const auto maxAbs32 = maskUpperWord(maxAbsIQ);
  66     /// Return reduced max
  67     return _mm512_reduce_max_epi32(maxAbs32);
  68   }
  69
  70   /// Pack compressed 9 bit data in network byte order
  71   /// See https://soco.intel.com/docs/DOC-2665619
  72   inline __m512i
  73   networkBytePack9b(const __m512i compData)
  74   {
  75     /// Logical shift left to align network order byte parts
  76     const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000100020003, 0x0004000500060007,
  77                                                  0x0000000100020003, 0x0004000500060007,
  78                                                  0x0000000100020003, 0x0004000500060007,
  79                                                  0x0000000100020003, 0x0004000500060007);
  80     const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
  81
  82     /// First epi8 shuffle of even indexed samples
  83     const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x0000000000000000, 0x0C0D080904050001,
  84                                                         0x0000000000000000, 0x0C0D080904050001,
  85                                                         0x0000000000000000, 0x0C0D080904050001,
  86                                                         0x0000000000000000, 0x0C0D080904050001);
  87     constexpr uint64_t k_byteMask1 = 0x00FF00FF00FF00FF;
  88     const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
  89
  90     /// Second epi8 shuffle of odd indexed samples
  91     const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000000000E, 0x0F0A0B0607020300,
  92                                                         0x000000000000000E, 0x0F0A0B0607020300,
  93                                                         0x000000000000000E, 0x0F0A0B0607020300,
  94                                                         0x000000000000000E, 0x0F0A0B0607020300);
  95     constexpr uint64_t k_byteMask2 = 0x01FE01FE01FE01FE;
  96     const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
  97
  98     /// Ternary blend of the two shuffled results
  99     const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000000000FF, 0x01FC07F01FC07F00,
 100                                                      0x00000000000000FF, 0x01FC07F01FC07F00,
 101                                                      0x00000000000000FF, 0x01FC07F01FC07F00,
 102                                                      0x00000000000000FF, 0x01FC07F01FC07F00);
 103     return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
 104   }
 105
 106
 107   /// Pack compressed 10 bit data in network byte order
 108   /// See https://soco.intel.com/docs/DOC-2665619
 109   inline __m512i
 110   networkBytePack10b(const __m512i compData)
 111   {
 112     /// Logical shift left to align network order byte parts
 113     const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
 114                                                  0x0000000200040006, 0x0000000200040006,
 115                                                  0x0000000200040006, 0x0000000200040006,
 116                                                  0x0000000200040006, 0x0000000200040006);
 117     const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
 118
 119     /// First epi8 shuffle of even indexed samples
 120     const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x000000000000000C, 0x0D08090004050001,
 121                                                         0x000000000000000C, 0x0D08090004050001,
 122                                                         0x000000000000000C, 0x0D08090004050001,
 123                                                         0x000000000000000C, 0x0D08090004050001);
 124     constexpr uint64_t k_byteMask1 = 0x01EF01EF01EF01EF;
 125     const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
 126
 127     /// Second epi8 shuffle of odd indexed samples
 128     const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x0000000000000E0F, 0x0A0B000607020300,
 129                                                         0x0000000000000E0F, 0x0A0B000607020300,
 130                                                         0x0000000000000E0F, 0x0A0B000607020300,
 131                                                         0x0000000000000E0F, 0x0A0B000607020300);
 132     constexpr uint64_t k_byteMask2 = 0x03DE03DE03DE03DE;
 133     const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
 134
 135     /// Ternary blend of the two shuffled results
 136     const __m512i k_ternLogSelect = _mm512_set_epi64(0x000000000000FF03, 0xF03F00FF03F03F00,
 137                                                      0x000000000000FF03, 0xF03F00FF03F03F00,
 138                                                      0x000000000000FF03, 0xF03F00FF03F03F00,
 139                                                      0x000000000000FF03, 0xF03F00FF03F03F00);
 140     return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
 141   }
 142
 143
 144   /// Pack compressed 12 bit data in network byte order
 145   /// See https://soco.intel.com/docs/DOC-2665619
 146   inline __m512i
 147   networkBytePack12b(const __m512i compData)
 148   {
 149     /// Logical shift left to align network order byte parts
 150     const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000400000004, 0x0000000400000004,
 151                                                  0x0000000400000004, 0x0000000400000004,
 152                                                  0x0000000400000004, 0x0000000400000004,
 153                                                  0x0000000400000004, 0x0000000400000004);
 154     const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
 155
 156     /// First epi8 shuffle of even indexed samples
 157     const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x00000000000C0D00, 0x0809000405000001,
 158                                                         0x00000000000C0D00, 0x0809000405000001,
 159                                                         0x00000000000C0D00, 0x0809000405000001,
 160                                                         0x00000000000C0D00, 0x0809000405000001);
 161     constexpr uint64_t k_byteMask1 = 0x06DB06DB06DB06DB;
 162     const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
 163
 164     /// Second epi8 shuffle of odd indexed samples
 165     const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000E0F000A, 0x0B00060700020300,
 166                                                         0x000000000E0F000A, 0x0B00060700020300,
 167                                                         0x000000000E0F000A, 0x0B00060700020300,
 168                                                         0x000000000E0F000A, 0x0B00060700020300);
 169     constexpr uint64_t k_byteMask2 = 0x0DB60DB60DB60DB6;
 170     const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
 171
 172     /// Ternary blend of the two shuffled results
 173     const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
 174                                                      0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
 175                                                      0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
 176                                                      0x00000000FF0F00FF, 0x0F00FF0F00FF0F00);
 177     return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
 178   }
 179
 180
 181   /// Unpack compressed 9 bit data in network byte order
 182   /// See https://soco.intel.com/docs/DOC-2665619
 183   inline __m512i
 184   networkByteUnpack9b(const uint8_t* inData)
 185   {
 186     /// Align chunks of compressed bytes into lanes to allow for expansion
 187     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
 188     const auto k_expPerm = _mm512_set_epi32(9, 8, 7, 6, 7, 6, 5, 4,
 189                                             5, 4, 3, 2, 3, 2, 1, 0);
 190     const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);
 191
 192     /// Byte shuffle to get all bits for each sample into 16b chunks
 193     /// Due to previous permute to get chunks of bytes into each lane, there is
 194     /// a different shuffle offset in each lane
 195     const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A08090708, 0x0607050604050304,
 196                                                        0x090A080907080607, 0x0506040503040203,
 197                                                        0x0809070806070506, 0x0405030402030102,
 198                                                        0x0708060705060405, 0x0304020301020001);
 199     const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 200
 201     /// Logical shift left to set sign bit
 202     const __m512i k_slBits = _mm512_set_epi64(0x0007000600050004, 0x0003000200010000,
 203                                               0x0007000600050004, 0x0003000200010000,
 204                                               0x0007000600050004, 0x0003000200010000,
 205                                               0x0007000600050004, 0x0003000200010000);
 206     const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);
 207
 208     /// Mask to zero unwanted bits
 209     const __m512i k_expMask = _mm512_set1_epi16(0xFF80);
 210     return _mm512_and_epi64(inSetSign, k_expMask);
 211   }
 212
 213
 214   /// Unpack compressed 10 bit data in network byte order
 215   /// See https://soco.intel.com/docs/DOC-2665619
 216   inline __m512i
 217   networkByteUnpack10b(const uint8_t* inData)
 218   {
 219     /// Align chunks of compressed bytes into lanes to allow for expansion
 220     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
 221     const auto k_expPerm = _mm512_set_epi32(10, 9, 8, 7, 8, 7, 6, 5,
 222                                              5, 4, 3, 2, 3, 2, 1, 0);
 223     const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);
 224
 225     /// Byte shuffle to get all bits for each sample into 16b chunks
 226     /// Due to previous permute to get chunks of bytes into each lane, lanes
 227     /// 0 and 2 happen to be aligned, but lane 1 is offset by 2 bytes
 228     const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A08090708, 0x0506040503040203,
 229                                                        0x0809070806070506, 0x0304020301020001,
 230                                                        0x0A0B090A08090708, 0x0506040503040203,
 231                                                        0x0809070806070506, 0x0304020301020001);
 232     const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 233
 234     /// Logical shift left to set sign bit
 235     const __m512i k_slBits = _mm512_set_epi64(0x0006000400020000, 0x0006000400020000,
 236                                               0x0006000400020000, 0x0006000400020000,
 237                                               0x0006000400020000, 0x0006000400020000,
 238                                               0x0006000400020000, 0x0006000400020000);
 239     const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);
 240
 241     /// Mask to zero unwanted bits
 242     const __m512i k_expMask = _mm512_set1_epi16(0xFFC0);
 243     return _mm512_and_epi64(inSetSign, k_expMask);
 244   }
 245
 246
 247   /// Unpack compressed 12 bit data in network byte order
 248   /// See https://soco.intel.com/docs/DOC-2665619
 249   inline __m512i
 250   networkByteUnpack12b(const uint8_t* inData)
 251   {
 252     /// Align chunks of compressed bytes into lanes to allow for expansion
 253     const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
 254     const auto k_expPerm = _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6,
 255                                              6, 5, 4, 3, 3, 2, 1, 0);
 256     const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);
 257
 258     /// Byte shuffle to get all bits for each sample into 16b chunks
 259     /// For 12b mantissa all lanes post-permute are aligned and require same shuffle offset
 260     const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A07080607, 0x0405030401020001,
 261                                                        0x0A0B090A07080607, 0x0405030401020001,
 262                                                        0x0A0B090A07080607, 0x0405030401020001,
 263                                                        0x0A0B090A07080607, 0x0405030401020001);
 264     const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 265
 266     /// Logical shift left to set sign bit
 267     const __m512i k_slBits = _mm512_set_epi64(0x0004000000040000, 0x0004000000040000,
 268                                               0x0004000000040000, 0x0004000000040000,
 269                                               0x0004000000040000, 0x0004000000040000,
 270                                               0x0004000000040000, 0x0004000000040000);
 271     const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);
 272
 273     /// Mask to zero unwanted bits
 274     const __m512i k_expMask = _mm512_set1_epi16(0xFFF0);
 275     return _mm512_and_epi64(inSetSign, k_expMask);
 276   }
 277
 278
 279   /// Unpack compressed 9 bit data in network byte order
 280   /// See https://soco.intel.com/docs/DOC-2665619
 281   /// This unpacking function is for 256b registers
 282   inline __m256i
 283   networkByteUnpack9b256(const uint8_t* inData)
 284   {
 285     /// Align chunks of compressed bytes into lanes to allow for expansion
 286     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
 287     const auto k_expPerm = _mm256_set_epi32(5, 4, 3, 2, 3, 2, 1, 0);
 288     const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);
 289
 290     /// Byte shuffle to get all bits for each sample into 16b chunks
 291     /// Due to previous permute to get chunks of bytes into each lane, there is
 292     /// a different shuffle offset in each lane
 293     const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0809070806070506, 0x0405030402030102,
 294                                                         0x0708060705060405, 0x0304020301020001);
 295     const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 296
 297     /// Logical shift left to set sign bit
 298     const __m256i k_slBits = _mm256_set_epi64x(0x0007000600050004, 0x0003000200010000,
 299                                                0x0007000600050004, 0x0003000200010000);
 300     const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);
 301
 302     /// Mask to zero unwanted bits
 303     const __m256i k_expMask = _mm256_set1_epi16(0xFF80);
 304     return _mm256_and_si256(inSetSign, k_expMask);
 305   }
 306
 307
 308   /// Unpack compressed 10 bit data in network byte order
 309   /// See https://soco.intel.com/docs/DOC-2665619
 310   /// This unpacking function is for 256b registers
 311   inline __m256i
 312   networkByteUnpack10b256(const uint8_t* inData)
 313   {
 314     /// Align chunks of compressed bytes into lanes to allow for expansion
 315     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
 316     const auto k_expPerm = _mm256_set_epi32(5, 4, 3, 2, 3, 2, 1, 0);
 317     const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);
 318
 319     /// Byte shuffle to get all bits for each sample into 16b chunks
 320     /// Due to previous permute to get chunks of bytes into each lane, lanes
 321     /// 0 and 2 happen to be aligned, but lane 1 is offset by 2 bytes
 322     const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0A0B090A08090708, 0x0506040503040203,
 323                                                         0x0809070806070506, 0x0304020301020001);
 324     const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 325
 326     /// Logical shift left to set sign bit
 327     const __m256i k_slBits = _mm256_set_epi64x(0x0006000400020000, 0x0006000400020000,
 328                                                0x0006000400020000, 0x0006000400020000);
 329     const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);
 330
 331     /// Mask to zero unwanted bits
 332     const __m256i k_expMask = _mm256_set1_epi16(0xFFC0);
 333     return _mm256_and_si256(inSetSign, k_expMask);
 334   }
 335
 336
 337   /// Unpack compressed 12 bit data in network byte order
 338   /// See https://soco.intel.com/docs/DOC-2665619
 339   /// This unpacking function is for 256b registers
 340   inline __m256i
 341   networkByteUnpack12b256(const uint8_t* inData)
 342   {
 343     /// Align chunks of compressed bytes into lanes to allow for expansion
 344     const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
 345     const auto k_expPerm = _mm256_set_epi32(6, 5, 4, 3, 3, 2, 1, 0);
 346     const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);
 347
 348     /// Byte shuffle to get all bits for each sample into 16b chunks
 349     /// For 12b mantissa all lanes post-permute are aligned and require same shuffle offset
 350     const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0A0B090A07080607, 0x0405030401020001,
 351                                                         0x0A0B090A07080607, 0x0405030401020001);
 352     const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);
 353
 354     /// Logical shift left to set sign bit
 355     const __m256i k_slBits = _mm256_set_epi64x(0x0004000000040000, 0x0004000000040000,
 356                                                0x0004000000040000, 0x0004000000040000);
 357     const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);
 358
 359     /// Mask to zero unwanted bits
 360     const __m256i k_expMask = _mm256_set1_epi16(0xFFF0);
 361     return _mm256_and_si256(inSetSign, k_expMask);
 362   }
 363 }