1 /******************************************************************************
3 * Copyright (c) 2020 Intel.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 *******************************************************************************/
/**
 * @brief xRAN BFP compression/decompression utility functions
 * @file xran_bfp_utils.hpp
 * @ingroup group_source_xran
 * @author Intel Corporation
 **/
28 #include <immintrin.h>
30 namespace BlockFloatCompander
/// Zero the upper 16-bit word of every 32-bit element, keeping the lower word.
/// @param inData 512-bit vector, treated as sixteen 32-bit elements.
/// @return inData ANDed with 0x0000FFFF in each 32-bit element.
inline __m512i
maskUpperWord(const __m512i inData)
{
  /// 0x0000FFFF broadcast to all sixteen 32-bit elements. This is the same bit
  /// pattern as eight 0x0000FFFF0000FFFF quadwords.
  const auto k_upperWordMask = _mm512_set1_epi32(0x0000FFFF);
  return _mm512_and_epi64(inData, k_upperWordMask);
}
/// Calculate exponent based on 16 max abs values using leading zero count.
/// @param maxAbs       Sixteen 32-bit values whose leading zeros are counted.
/// @param totShiftBits Per-element value the leading zero count is subtracted from.
/// @return totShiftBits minus the leading zero count of maxAbs, computed per
///         16-bit word with unsigned saturation (clamped at zero).
inline __m512i
expLzCnt(const __m512i maxAbs, const __m512i totShiftBits)
{
  /// Leading zero count of each 32-bit element. The count is at most 32, so it
  /// sits entirely in the low 16-bit word and the high word is zero.
  const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
  /// Unsigned saturating subtract: yields 0 instead of wrapping around when
  /// the count is larger than totShiftBits.
  return _mm512_subs_epu16(totShiftBits, lzCount);
}
54 /// Full horizontal max of 16b values
56 horizontalMax1x32(const __m512i maxAbsReg)
58 /// Swap each IQ pair in each lane (via 32b rotation) and compute max of
60 const auto maxRot16 = _mm512_rol_epi32(maxAbsReg, BlockFloatCompander::k_numBitsIQ);
61 const auto maxAbsIQ = _mm512_max_epi16(maxAbsReg, maxRot16);
62 /// Convert to 32b by removing repeated values in maxAbs
63 const auto maxAbs32 = maskUpperWord(maxAbsIQ);
64 /// Return reduced max
65 return _mm512_reduce_max_epi32(maxAbs32);
69 /// Perform horizontal max of 16 bit values across each lane
71 horizontalMax4x16(const __m512i maxAbsIn)
73 /// Swap 64b in each lane and compute max
74 const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
75 auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbsIn);
76 auto maxAbsHorz = _mm512_max_epi16(maxAbsIn, maxAbsPerm);
78 /// Swap each pair of 32b in each lane and compute max
79 const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
80 maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
81 maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
83 /// Swap each IQ pair in each lane (via 32b rotation) and compute max
84 maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
85 return _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
/// Perform U-plane input data re-ordering and vertical max abs of 16b values
/// Works on 4 RB at a time
/// @param inA First input register  (layout [A A A A A A A A A A A A B B B B]).
/// @param inB Second input register (layout [B B B B B B B B C C C C C C C C]).
/// @param inC Third input register  (layout [C C C C D D D D D D D D D D D D]).
/// @return Element-wise max of the absolute 16-bit values of the three
///         re-ordered registers.
inline __m512i
maxAbsVertical4RB(const __m512i inA, const __m512i inB, const __m512i inC)
{
  /// Re-order the next 4RB in input data into 3 registers
  /// Input SIMD vectors are:
  ///   [A A A A A A A A A A A A B B B B]
  ///   [B B B B B B B B C C C C C C C C]
  ///   [C C C C D D D D D D D D D D D D]
  /// Re-ordered SIMD vectors are:
  ///   [A A A A B B B B C C C C D D D D]
  ///   [A A A A B B B B C C C C D D D D]
  ///   [A A A A B B B B C C C C D D D D]
  ///
  /// Each mask bit covers one 64b element (two bits per 128b lane). A clear
  /// bit keeps the element of the first argument; a set bit takes the lane
  /// picked by the shuffle immediate from the two following vectors.

  /// Lane 0 kept from inA; lanes 1..3 <- inB[0], inC[0], inC[1]
  constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src
  constexpr int k_shuff1 = 0x41;
  const auto z_w1 = _mm512_mask_shuffle_i64x2(inA, k_msk1, inB, inC, k_shuff1);

  /// Lanes 1..2 kept from inB; lane 0 <- inA[1], lane 3 <- inC[2]
  constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src
  constexpr int k_shuff2 = 0xB1;
  const auto z_w2 = _mm512_mask_shuffle_i64x2(inB, k_msk2, inA, inC, k_shuff2);

  /// Lane 3 kept from inC; lanes 0..2 <- inA[2], inA[3], inB[3]
  constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src
  constexpr int k_shuff3 = 0xBE;
  const auto z_w3 = _mm512_mask_shuffle_i64x2(inC, k_msk3, inA, inB, k_shuff3);

  /// Perform max abs on these 3 registers
  const auto abs16_1 = _mm512_abs_epi16(z_w1);
  const auto abs16_2 = _mm512_abs_epi16(z_w2);
  const auto abs16_3 = _mm512_abs_epi16(z_w3);
  return _mm512_max_epi16(_mm512_max_epi16(abs16_1, abs16_2), abs16_3);
}
/// Selects first 32 bit value in each src lane and packs into laneNum of dest
/// @param src     Vector whose 32-bit elements 0, 4, 8 and 12 are picked.
/// @param dest    Vector that receives the four picked values; all other
///                lanes are returned unchanged.
/// @param laneNum Index of the 128-bit lane of dest to overwrite. Must be in
///                the range 0..3; it is used unchecked as an array index.
/// @return dest with lane laneNum replaced by {src[0], src[4], src[8], src[12]}.
inline __m512i
slidePermute(const __m512i src, const __m512i dest, const int laneNum)
{
  /// Indices 16, 20, 24, 28 have bit 4 set, which selects from the second
  /// table (src); the low four bits give elements 0, 4, 8, 12. The pattern is
  /// repeated for every lane so that any lane can be the target.
  const auto k_selectVals = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
                                             28, 24, 20, 16, 28, 24, 20, 16);
  /// One write mask per 128-bit lane (four 32-bit elements each); elements
  /// outside the mask keep the value from dest.
  constexpr uint16_t k_laneMsk[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
  return _mm512_mask_permutex2var_epi32(dest, k_laneMsk[laneNum], k_selectVals, src);
}