1 /******************************************************************************
3 * Copyright (c) 2019 Intel.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 *******************************************************************************/
20 * @brief xRAN BFP compression/decompression for C-plane with 64T64R
22 * @file xran_bfp_cplane64.cpp
23 * @ingroup group_source_xran
24 * @author Intel Corporation
27 #include "xran_compression.hpp"
28 #include "xran_bfp_utils.hpp"
31 #include <immintrin.h>
34 namespace BFP_CPlane_64
/// Namespace constants

/// Number of 16-bit values in one compression block
/// (k_numRegsPerBlock registers of 32 lanes each; presumably 64 interleaved I/Q pairs - TODO confirm,
/// the earlier "16 IQ pairs" note does not match 128 values)
constexpr int k_numDataElements = 128;

/// Number of AVX512 registers per compression block (input)
constexpr int k_numRegsPerBlock = 4;
41 maxAbsOneBlock(const __m512i* inData)
43 /// Vertical maxAbs on all registers
44 __m512i maxAbsReg = __m512i();
45 #pragma unroll(k_numRegsPerBlock)
46 for (int n = 0; n < k_numRegsPerBlock; ++n)
48 const auto thisRegAbs = _mm512_abs_epi16(inData[n]);
49 maxAbsReg = _mm512_max_epi16(thisRegAbs, maxAbsReg);
51 /// Horizontal max across remaining register
52 return BlockFloatCompander::horizontalMax1x32(maxAbsReg);
55 /// Compute exponent value for a set of 16 RB from the maximum absolute value
57 computeExponent_16RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
59 __m512i maxAbs = __m512i();
60 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
62 for (int n = 0; n < 16; ++n)
64 ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
66 /// Calculate exponent
67 return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
70 /// Compute exponent value for a set of 4 RB from the maximum absolute value
72 computeExponent_4RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
74 __m512i maxAbs = __m512i();
75 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
77 for (int n = 0; n < 4; ++n)
79 ((uint32_t*)&maxAbs)[n] = maxAbsOneBlock(dataInAddr + n * k_numRegsPerBlock);
81 /// Calculate exponent
82 return BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
85 /// Compute exponent value for 1 RB from the maximum absolute value
87 computeExponent_1RB(const BlockFloatCompander::ExpandedData& dataIn, const __m512i totShiftBits)
89 __m512i maxAbs = __m512i();
90 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
91 ((uint32_t*)&maxAbs)[0] = maxAbsOneBlock(dataInAddr);
92 /// Calculate exponent
93 const auto exps = BlockFloatCompander::expLzCnt(maxAbs, totShiftBits);
94 return ((uint8_t*)&exps)[0];
99 /// Apply compression to one compression block
100 template<BlockFloatCompander::PackFunction networkBytePack>
102 applyCompressionN_1RB(const __m512i* dataIn, uint8_t* outBlockAddr,
103 const int iqWidth, const uint8_t thisExp, const int totNumBytesPerReg, const uint16_t rbWriteMask)
105 /// Store exponent first
106 *outBlockAddr = thisExp;
107 #pragma unroll(k_numRegsPerBlock)
108 for (int n = 0; n < k_numRegsPerBlock; ++n)
110 /// Apply the exponent shift
111 const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
112 /// Pack compressed data network byte order
113 const auto compDataBytePacked = networkBytePack(compData);
114 /// Now have 1 register worth of bytes separated into 4 chunks (1 per lane)
115 /// Use four offset stores to join
116 const auto thisOutRegAddr = outBlockAddr + 1 + n * totNumBytesPerReg;
117 _mm_mask_storeu_epi8(thisOutRegAddr, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
118 _mm_mask_storeu_epi8(thisOutRegAddr + iqWidth, rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
119 _mm_mask_storeu_epi8(thisOutRegAddr + (2 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
120 _mm_mask_storeu_epi8(thisOutRegAddr + (3 * iqWidth), rbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 3));
124 /// Derive and apply 9, 10, or 12bit compression to 16 compression blocks
125 template<BlockFloatCompander::PackFunction networkBytePack>
127 compressN_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
128 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
130 const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
131 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
133 for (int n = 0; n < 16; ++n)
135 applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
139 /// Derive and apply 9, 10, or 12bit compression to 4 compression blocks
140 template<BlockFloatCompander::PackFunction networkBytePack>
142 compressN_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
143 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
145 const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
146 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
148 for (int n = 0; n < 4; ++n)
150 applyCompressionN_1RB<networkBytePack>(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * totNumBytesPerBlock, dataIn.iqWidth, ((uint8_t*)&exponents)[n * 4], totNumBytesPerReg, rbWriteMask);
154 /// Derive and apply 9, 10, or 12bit compression to 1 RB
155 template<BlockFloatCompander::PackFunction networkBytePack>
157 compressN_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
158 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
160 const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
161 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
162 applyCompressionN_1RB<networkBytePack>(dataInAddr, dataOut->dataCompressed, dataIn.iqWidth, thisExponent, totNumBytesPerReg, rbWriteMask);
165 /// Calls compression function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
166 template<BlockFloatCompander::PackFunction networkBytePack>
168 compressByAllocN(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut,
169 const __m512i totShiftBits, const int totNumBytesPerBlock, const int totNumBytesPerReg, const uint16_t rbWriteMask)
171 switch (dataIn.numBlocks)
174 compressN_16RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
178 compressN_4RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
182 compressN_1RB<networkBytePack>(dataIn, dataOut, totShiftBits, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask);
189 /// Apply 8b compression to 1 compression block.
191 applyCompression8_1RB(const __m512i* dataIn, uint8_t* outBlockAddr, const uint8_t thisExp)
193 /// Store exponent first
194 *outBlockAddr = thisExp;
195 constexpr uint32_t k_writeMask = 0xFFFFFFFF;
196 __m256i* regOutAddr = reinterpret_cast<__m256i*>(outBlockAddr + 1);
197 #pragma unroll(k_numRegsPerBlock)
198 for (int n = 0; n < k_numRegsPerBlock; ++n)
200 /// Apply the exponent shift
201 const auto compData = _mm512_srai_epi16(dataIn[n], thisExp);
202 /// Truncate to 8bit and store
203 _mm256_mask_storeu_epi8(regOutAddr + n, k_writeMask, _mm512_cvtepi16_epi8(compData));
207 /// Derive and apply 8b compression to 16 compression blocks
209 compress8_16RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
211 const auto exponents = computeExponent_16RB(dataIn, totShiftBits);
212 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
214 for (int n = 0; n < 16; ++n)
216 applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
220 /// Derive and apply 8b compression to 4 compression blocks
222 compress8_4RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
224 const auto exponents = computeExponent_4RB(dataIn, totShiftBits);
225 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
227 for (int n = 0; n < 4; ++n)
229 applyCompression8_1RB(dataInAddr + n * k_numRegsPerBlock, dataOut->dataCompressed + n * (k_numDataElements + 1), ((uint8_t*)&exponents)[n * 4]);
233 /// Derive and apply 8b compression to 1 compression block
235 compress8_1RB(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
237 const auto thisExponent = computeExponent_1RB(dataIn, totShiftBits);
238 const __m512i* dataInAddr = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
239 applyCompression8_1RB(dataInAddr, dataOut->dataCompressed, thisExponent);
242 /// Calls compression function specific to the number of RB to be executed. For 8 bit iqWidth.
244 compressByAlloc8(const BlockFloatCompander::ExpandedData& dataIn, BlockFloatCompander::CompressedData* dataOut, const __m512i totShiftBits)
246 switch (dataIn.numBlocks)
249 compress8_16RB(dataIn, dataOut, totShiftBits);
253 compress8_4RB(dataIn, dataOut, totShiftBits);
257 compress8_1RB(dataIn, dataOut, totShiftBits);
264 /// Expand 1 compression block
265 template<BlockFloatCompander::UnpackFunction networkByteUnpack>
267 applyExpansionN_1RB(const uint8_t* expAddr, __m512i* dataOutAddr, const int maxExpShift, const int totNumBytesPerReg)
269 static constexpr uint8_t k_WriteMask = 0xFF;
270 const auto thisExpShift = maxExpShift - *expAddr;
271 #pragma unroll(k_numRegsPerBlock)
272 for (int n = 0; n < k_numRegsPerBlock; ++n)
274 const auto thisInRegAddr = expAddr + 1 + n * totNumBytesPerReg;
275 /// Unpack network order packed data
276 const auto inDataUnpacked = networkByteUnpack(thisInRegAddr);
277 /// Apply exponent scaling (by appropriate arithmetic shift right)
278 const auto expandedData = _mm512_srai_epi16(inDataUnpacked, thisExpShift);
279 /// Write expanded data to output
280 _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expandedData);
284 /// Calls expansion function specific to the number of blocks to be executed. For 9, 10, or 12bit iqWidth.
285 template<BlockFloatCompander::UnpackFunction networkByteUnpack>
286 void expandByAllocN(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut,
287 const int totNumBytesPerBlock, const int totNumBytesPerReg, const int maxExpShift)
289 __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
290 switch (dataIn.numBlocks)
294 for (int n = 0; n < 16; ++n)
296 applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
302 for (int n = 0; n < 4; ++n)
304 applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed + n * totNumBytesPerBlock, dataOutAddr + n * k_numRegsPerBlock, maxExpShift, totNumBytesPerReg);
309 applyExpansionN_1RB<networkByteUnpack>(dataIn.dataCompressed, dataOutAddr, maxExpShift, totNumBytesPerReg);
315 /// Apply expansion to 1 compression block
317 applyExpansion8_1RB(const uint8_t* expAddr, __m512i* dataOutAddr)
319 const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(expAddr + 1);
320 static constexpr uint8_t k_WriteMask = 0xFF;
321 #pragma unroll(k_numRegsPerBlock)
322 for (int n = 0; n < k_numRegsPerBlock; ++n)
324 const auto compData16 = _mm512_cvtepi8_epi16(rawDataIn[n]);
325 const auto expData = _mm512_slli_epi16(compData16, *expAddr);
326 _mm512_mask_storeu_epi64(dataOutAddr + n, k_WriteMask, expData);
330 /// Calls expansion function specific to the number of RB to be executed. For 8 bit iqWidth.
332 expandByAlloc8(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut)
334 __m512i* dataOutAddr = reinterpret_cast<__m512i*>(dataOut->dataExpanded);
335 switch (dataIn.numBlocks)
339 for (int n = 0; n < 16; ++n)
341 applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
347 for (int n = 0; n < 4; ++n)
349 applyExpansion8_1RB(dataIn.dataCompressed + n * (k_numDataElements + 1), dataOutAddr + n * k_numRegsPerBlock);
354 applyExpansion8_1RB(dataIn.dataCompressed, dataOutAddr);
361 /// Main kernel function for 64 antenna C-plane compression.
362 /// Starts by determining iqWidth specific parameters and functions.
364 BlockFloatCompander::BFPCompressCtrlPlane64Avx512(const ExpandedData& dataIn, CompressedData* dataOut)
366 /// Compensation for extra zeros in 32b leading zero count when computing exponent
367 const auto totShiftBits8 = _mm512_set1_epi32(25);
368 const auto totShiftBits9 = _mm512_set1_epi32(24);
369 const auto totShiftBits10 = _mm512_set1_epi32(23);
370 const auto totShiftBits12 = _mm512_set1_epi32(21);
372 /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
373 const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
374 /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
375 const auto totNumBytesPerReg = dataIn.iqWidth << 2;
377 /// Compressed data write mask for each iqWidth option
378 constexpr uint16_t rbWriteMask9 = 0x01FF;
379 constexpr uint16_t rbWriteMask10 = 0x03FF;
380 constexpr uint16_t rbWriteMask12 = 0x0FFF;
382 switch (dataIn.iqWidth)
385 BFP_CPlane_64::compressByAlloc8(dataIn, dataOut, totShiftBits8);
389 BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack9b>(dataIn, dataOut, totShiftBits9, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask9);
393 BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack10b>(dataIn, dataOut, totShiftBits10, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask10);
397 BFP_CPlane_64::compressByAllocN<BlockFloatCompander::networkBytePack12b>(dataIn, dataOut, totShiftBits12, totNumBytesPerBlock, totNumBytesPerReg, rbWriteMask12);
403 /// Main kernel function for 64 antenna C-plane expansion.
404 /// Starts by determining iqWidth specific parameters and functions.
406 BlockFloatCompander::BFPExpandCtrlPlane64Avx512(const CompressedData& dataIn, ExpandedData* dataOut)
408 constexpr int k_maxExpShift9 = 7;
409 constexpr int k_maxExpShift10 = 6;
410 constexpr int k_maxExpShift12 = 4;
412 /// Total number of data bytes per compression block is (iqWidth * numElements / 8) + 1
413 const auto totNumBytesPerBlock = ((BFP_CPlane_64::k_numDataElements * dataIn.iqWidth) >> 3) + 1;
414 /// Total number of compressed bytes to handle per register is 32 * iqWidth / 8
415 const auto totNumBytesPerReg = dataIn.iqWidth << 2;
417 switch (dataIn.iqWidth)
420 BFP_CPlane_64::expandByAlloc8(dataIn, dataOut);
424 BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack9b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift9);
428 BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack10b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift10);
432 BFP_CPlane_64::expandByAllocN<BlockFloatCompander::networkByteUnpack12b>(dataIn, dataOut, totNumBytesPerBlock, totNumBytesPerReg, k_maxExpShift12);