/****************************************************************************** * * Copyright (c) 2020 Intel. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * *******************************************************************************/ /** * @brief xRAN BFP compression/decompression utilities functions * * @file xran_bfp_utils.hpp * @ingroup group_source_xran * @author Intel Corporation **/ #pragma once #include namespace BlockFloatCompander { /// Calculate exponent based on 16 max abs values using leading zero count. inline __m512i maskUpperWord(const __m512i inData) { const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF); return _mm512_and_epi64(inData, k_upperWordMask); } /// Calculate exponent based on 16 max abs values using leading zero count. inline __m512i expLzCnt(const __m512i maxAbs, const __m512i totShiftBits) { /// Compute exponent const auto lzCount = _mm512_lzcnt_epi32(maxAbs); return _mm512_subs_epu16(totShiftBits, lzCount); } /// Full horizontal max of 16b values inline int horizontalMax1x32(const __m512i maxAbsReg) { /// Swap each IQ pair in each lane (via 32b rotation) and compute max of /// each pair. const auto maxRot16 = _mm512_rol_epi32(maxAbsReg, BlockFloatCompander::k_numBitsIQ); const auto maxAbsIQ = _mm512_max_epi16(maxAbsReg, maxRot16); /// Convert to 32b by removing repeated values in maxAbs const auto maxAbs32 = maskUpperWord(maxAbsIQ); /// Return reduced max return _mm512_reduce_max_epi32(maxAbs32); } /// Perform horizontal max of 16 bit values across each lane inline __m512i horizontalMax4x16(const __m512i maxAbsIn) { /// Swap 64b in each lane and compute max const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1); auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbsIn); auto maxAbsHorz = _mm512_max_epi16(maxAbsIn, maxAbsPerm); /// Swap each pair of 32b in each lane and compute max const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz); maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm); /// Swap each IQ pair in each lane (via 32b rotation) and compute max maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ); return _mm512_max_epi16(maxAbsHorz, maxAbsPerm); } /// Perform U-plane input data re-ordering and vertical max abs of 16b values /// Works on 4 RB at a time inline __m512i maxAbsVertical4RB(const __m512i inA, const __m512i inB, const __m512i inC) { /// Re-order the next 4RB in input data into 3 registers /// Input SIMD vectors are: /// [A A A A A A A A A A A A B B B B] /// [B B B B B B B B C C C C C C C C] /// [C C C C D D D D D D D D D D D D] /// Re-ordered SIMD vectors are: /// [A A A A B B B B C C C C D D D D] /// [A A A A B B B B C C C C D D D D] /// [A A A A B B B B C C C C D D D D] constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src constexpr int k_shuff1 = 0x41; const auto z_w1 = _mm512_mask_shuffle_i64x2(inA, k_msk1, inB, inC, k_shuff1); constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src constexpr int k_shuff2 = 0xB1; const auto z_w2 = _mm512_mask_shuffle_i64x2(inB, k_msk2, inA, inC, k_shuff2); constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src constexpr int k_shuff3 = 0xBE; const auto z_w3 = _mm512_mask_shuffle_i64x2(inC, k_msk3, inA, inB, k_shuff3); /// Perform max abs on these 3 registers const auto abs16_1 = _mm512_abs_epi16(z_w1); const auto abs16_2 = _mm512_abs_epi16(z_w2); const auto abs16_3 = _mm512_abs_epi16(z_w3); return _mm512_max_epi16(_mm512_max_epi16(abs16_1, abs16_2), abs16_3); } /// Selects first 32 bit value in each src lane and packs into laneNum of dest inline __m512i slidePermute(const __m512i src, const __m512i dest, const int laneNum) { const auto k_selectVals = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16, 28, 24, 20, 16, 28, 24, 20, 16); constexpr uint16_t k_laneMsk[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 }; return _mm512_mask_permutex2var_epi32(dest, k_laneMsk[laneNum], k_selectVals, src); } }