X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=fhi_lib%2Flib%2Fsrc%2Fxran_compression.cpp;h=112caae2a659e87d8c2d0b0d0b0cff464ecbc2c1;hb=2de97529a4c5a1922214ba0e6f0fb84cacbd0bc7;hp=8730a201b1d99a0145c08127d981c39d7d5a5cbc;hpb=cef07f74965b1749dd909fc1322e211489fea2ea;p=o-du%2Fphy.git diff --git a/fhi_lib/lib/src/xran_compression.cpp b/fhi_lib/lib/src/xran_compression.cpp index 8730a20..112caae 100644 --- a/fhi_lib/lib/src/xran_compression.cpp +++ b/fhi_lib/lib/src/xran_compression.cpp @@ -1,6 +1,6 @@ /****************************************************************************** * -* Copyright (c) 2019 Intel. +* Copyright (c) 2020 Intel. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,170 +16,334 @@ * *******************************************************************************/ +/** + * @brief xRAN BFP compression/decompression U-plane implementation and interface functions + * + * @file xran_compression.cpp + * @ingroup group_source_xran + * @author Intel Corporation + **/ + #include "xran_compression.hpp" +#include "xran_compression.h" +#include "xran_mod_compression.h" +#include "xran_fh_o_du.h" #include #include #include +#include +#include -void -BlockFloatCompander::BlockFloatCompress_AVX512(const ExpandedData& dataIn, CompressedData* dataOut) -{ - __m512i maxAbs = __m512i(); +using namespace BlockFloatCompander; - /// Load data and find max(abs(RB)) - const __m512i* rawData = reinterpret_cast(dataIn.dataExpanded); - static constexpr int k_numInputLoopIts = BlockFloatCompander::k_numRB / 4; +/** callback function type for Symbol packet */ +typedef void (*xran_bfp_compress_fn)(const BlockFloatCompander::ExpandedData& dataIn, + BlockFloatCompander::CompressedData* dataOut); -#pragma unroll(k_numInputLoopIts) - for (int n = 0; n < k_numInputLoopIts; ++n) +/** callback function type for Symbol packet */ +typedef void (*xran_bfp_decompress_fn)(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut); + +int32_t +xranlib_compress(const struct xranlib_compress_request *request, + struct xranlib_compress_response *response) { - /// Re-order the next 4RB in input data into 3 registers - /// Input SIMD vectors are: - /// [A A A A A A A A A A A A B B B B] - /// [B B B B B B B B C C C C C C C C] - /// [C C C C D D D D D D D D D D D D] - /// Re-ordered SIMD vectors are: - /// [A A A A B B B B C C C C D D D D] - /// [A A A A B B B B C C C C D D D D] - /// [A A A A B B B B C C C C D D D D] - static constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src - static constexpr int k_shuff1 = 0x41; - const auto z_w1 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 0], k_msk1, rawData[3 * n + 1], rawData[3 * n + 2], k_shuff1); - - static constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src - static constexpr int k_shuff2 = 0xB1; - const auto z_w2 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 1], k_msk2, rawData[3 * n + 0], rawData[3 * n + 2], k_shuff2); - - static constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src - static constexpr int k_shuff3 = 0xBE; - const auto z_w3 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 2], k_msk3, rawData[3 * n + 0], rawData[3 * n + 1], k_shuff3); - - /// Perform max abs on these 3 registers - const auto abs16_1 = _mm512_abs_epi16(z_w1); - const auto abs16_2 = _mm512_abs_epi16(z_w2); - const auto abs16_3 = _mm512_abs_epi16(z_w3); - const auto maxAbs_12 = _mm512_max_epi16(abs16_1, abs16_2); - const auto maxAbs_123 = _mm512_max_epi16(maxAbs_12, abs16_3); - - /// Perform horizontal max over each lane - /// Swap 64b in each lane and compute max - static const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1); - auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbs_123); - auto maxAbsHorz = _mm512_max_epi16(maxAbs_123, maxAbsPerm); - - /// Swap each pair of 32b in each lane and compute max - static const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz); - maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm); - - /// Swap each IQ pair in each lane (via 32b rotation) and compute max - maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ); - maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm); - - /// Insert values into maxAbs - /// Use sliding mask to insert wanted values into maxAbs - /// Pairs of values will be inserted and corrected outside of loop - static const auto k_select4RB = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16, - 28, 24, 20, 16, 28, 24, 20, 16); - static constexpr uint16_t k_expMsk[k_numInputLoopIts] = { 0x000F, 0x00F0, 0x0F00, 0xF000 }; - maxAbs = _mm512_mask_permutex2var_epi32(maxAbs, k_expMsk[n], k_select4RB, maxAbsHorz); + if (request->compMethod == XRAN_COMPMETHOD_MODULATION) + { + struct xranlib_5gnr_mod_compression_request mod_request; + struct xranlib_5gnr_mod_compression_response mod_response; + mod_request.data_in = request->data_in; + mod_request.unit = request->ScaleFactor; + mod_request.modulation = (enum xran_modulation_order)(request->iqWidth * 2); + mod_request.num_symbols = request->numRBs * XRAN_NUM_OF_SC_PER_RB; + mod_request.re_mask = request->reMask; + mod_response.data_out = response->data_out; + response->len = (request->numRBs * XRAN_NUM_OF_SC_PER_RB * request->iqWidth * 2) >> 3; + + return xranlib_5gnr_mod_compression(&mod_request, &mod_response); + } + else{ + if(_may_i_use_cpu_feature(_FEATURE_AVX512IFMA52)) { + return xranlib_compress_avxsnc(request,response); + } else { + return xranlib_compress_avx512(request,response); + } + } } - /// Convert to 32b by removing repeated values in maxAbs - static const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, - 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, - 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, - 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF); - maxAbs = _mm512_and_epi64(maxAbs, k_upperWordMask); - - /// Compute exponent and store for later use - static constexpr int k_expTotShiftBits = 32 - BlockFloatCompander::k_iqWidth + 1; - const auto totShiftBits = _mm512_set1_epi32(k_expTotShiftBits); - const auto lzCount = _mm512_lzcnt_epi32(maxAbs); - const auto exponent = _mm512_sub_epi32(totShiftBits, lzCount); - int8_t storedExp[BlockFloatCompander::k_numRB] = {}; - static constexpr uint16_t k_expWriteMask = 0xFFFF; - _mm512_mask_cvtepi32_storeu_epi8(storedExp, k_expWriteMask, exponent); - - /// Shift 1RB by corresponding exponent and write exponent and data to output - /// Output data is packed exponent first followed by corresponding compressed RB -#pragma unroll(BlockFloatCompander::k_numRB) - for (int n = 0; n < BlockFloatCompander::k_numRB; ++n) +int32_t +xranlib_decompress(const struct xranlib_decompress_request *request, + struct xranlib_decompress_response *response) { - const __m512i* rawDataIn = reinterpret_cast(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal); - auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]); + if (request->compMethod == XRAN_COMPMETHOD_MODULATION) + { + struct xranlib_5gnr_mod_decompression_request mod_request; + struct xranlib_5gnr_mod_decompression_response mod_response; + mod_request.data_in = request->data_in; + mod_request.unit = request->ScaleFactor; + mod_request.modulation = (enum xran_modulation_order)(request->iqWidth * 2); + mod_request.num_symbols = request->numRBs * XRAN_NUM_OF_SC_PER_RB; + mod_request.re_mask = request->reMask; + mod_response.data_out = response->data_out; + response->len = request->numRBs * XRAN_NUM_OF_SC_PER_RB * 4; - dataOut->dataCompressed[n * (BlockFloatCompander::k_numREReal + 1)] = storedExp[n]; - static constexpr uint32_t k_rbMask = 0x00FFFFFF; // Write mask for 1RB (24 values) - _mm512_mask_cvtepi16_storeu_epi8(dataOut->dataCompressed + n * (BlockFloatCompander::k_numREReal + 1) + 1, k_rbMask, compData); + return xranlib_5gnr_mod_decompression(&mod_request, &mod_response); + } + else{ + if(_may_i_use_cpu_feature(_FEATURE_AVX512IFMA52)) { + return xranlib_decompress_avxsnc(request,response); + } else { + return xranlib_decompress_avx512(request,response); + } + } } -} +int32_t +xranlib_compress_bfw(const struct xranlib_compress_request *request, + struct xranlib_compress_response *response) + { + if(_may_i_use_cpu_feature(_FEATURE_AVX512IFMA52)) { + return xranlib_compress_avxsnc_bfw(request,response); + } else { + return xranlib_compress_avx512_bfw(request,response); + } + } -void -BlockFloatCompander::BlockFloatExpand_AVX512(const CompressedData& dataIn, ExpandedData* dataOut) -{ -#pragma unroll(BlockFloatCompander::k_numRB) - for (int n = 0; n < BlockFloatCompander::k_numRB; ++n) +int32_t +xranlib_decompress_bfw(const struct xranlib_decompress_request *request, + struct xranlib_decompress_response *response) { - /// Expand 1RB of data - const __m256i* rawDataIn = reinterpret_cast(dataIn.dataCompressed + n * (BlockFloatCompander::k_numREReal + 1) + 1); - const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn); - const auto expData = _mm512_slli_epi16(compData16, *(dataIn.dataCompressed + n * (BlockFloatCompander::k_numREReal + 1))); - - /// Write expanded data to output - static constexpr uint8_t k_rbMask64 = 0b00111111; // 64b write mask for 1RB (24 int16 values) - _mm512_mask_storeu_epi64(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_rbMask64, expData); + if(_may_i_use_cpu_feature(_FEATURE_AVX512IFMA52)) { + return xranlib_decompress_avxsnc_bfw(request,response); + } else { + return xranlib_decompress_avx512_bfw(request,response); } } +int32_t +xranlib_compress_avx512(const struct xranlib_compress_request *request, + struct xranlib_compress_response *response) +{ + BlockFloatCompander::ExpandedData expandedDataInput; + BlockFloatCompander::CompressedData compressedDataOut; + xran_bfp_compress_fn com_fn = NULL; + uint16_t totalRBs = request->numRBs; + uint16_t remRBs = totalRBs; + int16_t len = 0; + int16_t block_idx_bytes = 0; -void -BlockFloatCompander::BlockFloatCompress_Basic(const ExpandedData& dataIn, CompressedData* dataOut) + switch (request->iqWidth) { + case 8: + case 9: + case 10: + case 12: + com_fn = BlockFloatCompander::BFPCompressUserPlaneAvx512; + break; + default: + com_fn = BlockFloatCompander::BFPCompressRef; + break; + } + + expandedDataInput.iqWidth = request->iqWidth; + expandedDataInput.numDataElements = 24; + + while (remRBs){ + expandedDataInput.dataExpanded = &request->data_in[block_idx_bytes]; + compressedDataOut.dataCompressed = (uint8_t*)&response->data_out[len]; + if(remRBs >= 16){ + expandedDataInput.numBlocks = 16; + com_fn(expandedDataInput, &compressedDataOut); + len += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)16); + block_idx_bytes += 16*expandedDataInput.numDataElements; + remRBs -= 16; + }else if(remRBs >= 4){ + expandedDataInput.numBlocks = 4; + com_fn(expandedDataInput, &compressedDataOut); + len += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)4); + block_idx_bytes +=4*expandedDataInput.numDataElements; + remRBs -=4; + }else if (remRBs >= 1){ + expandedDataInput.numBlocks = 1; + com_fn(expandedDataInput, &compressedDataOut); + len += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)1); + block_idx_bytes +=1*expandedDataInput.numDataElements; + remRBs = remRBs - 1; + } + } + + response->len = ((3 * expandedDataInput.iqWidth) + 1) * totalRBs; + + return 0; +} + +int32_t +xranlib_decompress_avx512(const struct xranlib_decompress_request *request, + struct xranlib_decompress_response *response) { - int16_t maxAbs[BlockFloatCompander::k_numRB]; - for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb) - { - // Find max abs value for this RB - maxAbs[rb] = 0; - for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re) - { - auto dataIdx = rb * BlockFloatCompander::k_numREReal + re; - int16_t dataAbs = (int16_t)std::abs(dataIn.dataExpanded[dataIdx]); - maxAbs[rb] = std::max(maxAbs[rb], dataAbs); + BlockFloatCompander::CompressedData compressedDataInput; + BlockFloatCompander::ExpandedData expandedDataOut; + + xran_bfp_decompress_fn decom_fn = NULL; + uint16_t totalRBs = request->numRBs; + uint16_t remRBs = totalRBs; + int16_t len = 0; + int16_t block_idx_bytes = 0; + + switch (request->iqWidth) { + case 8: + case 9: + case 10: + case 12: + decom_fn = BlockFloatCompander::BFPExpandUserPlaneAvx512; + break; + default: + decom_fn = BlockFloatCompander::BFPExpandRef; + break; } - // Find exponent - static constexpr int k_expTotShiftBits16 = 16 - BlockFloatCompander::k_iqWidth + 1; - auto thisExp = (int8_t)(k_expTotShiftBits16 - __lzcnt16(maxAbs[rb])); - auto expIdx = rb * (BlockFloatCompander::k_numREReal + 1); - dataOut->dataCompressed[expIdx] = thisExp; + compressedDataInput.iqWidth = request->iqWidth; + compressedDataInput.numDataElements = 24; - // ARS data by exponent - for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re) - { - auto dataIdxIn = rb * BlockFloatCompander::k_numREReal + re; - auto dataIdxOut = (expIdx + 1) + re; - dataOut->dataCompressed[dataIdxOut] = (int8_t)(dataIn.dataExpanded[dataIdxIn] >> thisExp); + while(remRBs) { + compressedDataInput.dataCompressed = (uint8_t*)&request->data_in[block_idx_bytes]; + expandedDataOut.dataExpanded = &response->data_out[len]; + if(remRBs >= 16){ + compressedDataInput.numBlocks = 16; + decom_fn(compressedDataInput, &expandedDataOut); + len += 16*compressedDataInput.numDataElements; + block_idx_bytes += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)16); + remRBs -= 16; + }else if(remRBs >= 4){ + compressedDataInput.numBlocks = 4; + decom_fn(compressedDataInput, &expandedDataOut); + len += 4*compressedDataInput.numDataElements; + block_idx_bytes += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)4); + remRBs -=4; + }else if (remRBs >= 1){ + compressedDataInput.numBlocks = 1; + decom_fn(compressedDataInput, &expandedDataOut); + len += 1*compressedDataInput.numDataElements; + block_idx_bytes += ((3 * compressedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_maxNumBlocks,(int16_t)1); + remRBs = remRBs - 1; + } } - } + + response->len = totalRBs * compressedDataInput.numDataElements * sizeof(int16_t); + + return 0; } +int32_t +xranlib_compress_avx512_bfw(const struct xranlib_compress_request *request, + struct xranlib_compress_response *response) +{ + BlockFloatCompander::ExpandedData expandedDataInput; + BlockFloatCompander::CompressedData compressedDataOut; + xran_bfp_compress_fn com_fn = NULL; + + if (request->numRBs != 1){ + printf("Unsupported numRBs %d\n", request->numRBs); + return -1; + } + + switch (request->iqWidth) { + case 8: + case 9: + case 10: + case 12: + switch (request->numDataElements) { + case 16: + com_fn = BlockFloatCompander::BFPCompressCtrlPlane8Avx512; + break; + case 32: + com_fn = BlockFloatCompander::BFPCompressCtrlPlane16Avx512; + break; + case 64: + com_fn = BlockFloatCompander::BFPCompressCtrlPlane32Avx512; + break; + case 128: + com_fn = BlockFloatCompander::BFPCompressCtrlPlane64Avx512; + break; + case 24: + default: + printf("Unsupported numDataElements %d\n", request->numDataElements); + return -1; + break; + } + break; + default: + printf("Unsupported iqWidth %d\n", request->iqWidth); + return -1; + break; + } + + expandedDataInput.iqWidth = request->iqWidth; + expandedDataInput.numDataElements = request->numDataElements; + expandedDataInput.numBlocks = 1; + expandedDataInput.dataExpanded = &request->data_in[0]; + compressedDataOut.dataCompressed = (uint8_t*)&response->data_out[0]; + + com_fn(expandedDataInput, &compressedDataOut); + + response->len = (((expandedDataInput.numDataElements * expandedDataInput.iqWidth) >> 3) + 1) + * request->numRBs; -void -BlockFloatCompander::BlockFloatExpand_Basic(const CompressedData& dataIn, ExpandedData* dataOut) + return 0; +} + +int32_t +xranlib_decompress_avx512_bfw(const struct xranlib_decompress_request *request, + struct xranlib_decompress_response *response) { - // Expand data - for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb) - { - for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re) - { - auto dataIdxOut = rb * BlockFloatCompander::k_numREReal + re; - auto expIdx = rb * (BlockFloatCompander::k_numREReal + 1); - auto dataIdxIn = (expIdx + 1) + re; - auto thisData = (int16_t)dataIn.dataCompressed[dataIdxIn]; - auto thisExp = (int16_t)dataIn.dataCompressed[expIdx]; - dataOut->dataExpanded[dataIdxOut] = (int16_t)(thisData << thisExp); + BlockFloatCompander::CompressedData compressedDataInput; + BlockFloatCompander::ExpandedData expandedDataOut; + xran_bfp_decompress_fn decom_fn = NULL; + + if (request->numRBs != 1){ + printf("Unsupported numRBs %d\n", request->numRBs); + return -1; } - } + + switch (request->iqWidth) { + case 8: + case 9: + case 10: + case 12: + switch (request->numDataElements) { + case 16: + decom_fn = BlockFloatCompander::BFPExpandCtrlPlane8Avx512; + break; + case 32: + decom_fn = BlockFloatCompander::BFPExpandCtrlPlane16Avx512; + break; + case 64: + decom_fn = BlockFloatCompander::BFPExpandCtrlPlane32Avx512; + break; + case 128: + decom_fn = BlockFloatCompander::BFPExpandCtrlPlane64Avx512; + break; + case 24: + default: + printf("Unsupported numDataElements %d\n", request->numDataElements); + return -1; + break; + } + break; + default: + printf("Unsupported iqWidth %d\n", request->iqWidth); + return -1; + break; + } + + compressedDataInput.iqWidth = request->iqWidth; + compressedDataInput.numDataElements = request->numDataElements; + compressedDataInput.numBlocks = 1; + compressedDataInput.dataCompressed = (uint8_t*)&request->data_in[0]; + expandedDataOut.dataExpanded = &response->data_out[0]; + + decom_fn(compressedDataInput, &expandedDataOut); + + response->len = request->numRBs * compressedDataInput.numDataElements * sizeof(int16_t); + + return 0; } +