o-du/phy
Intel O-RAN/X-RAN Generated Doxygen Documentation
xran_compression.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 *
3 * Copyright (c) 2019 Intel.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *******************************************************************************/
18 
19 #include "xran_compression.hpp"
20 #include "xran_compression.h"
21 #include <complex>
22 #include <algorithm>
23 #include <immintrin.h>
24 #include <limits.h>
25 #include <cstring>
26 
/// Absolute value of a 16-bit sample, saturated so the result always fits in int16_t.
/// The most negative value has no positive counterpart in two's complement, so it
/// is mapped to the largest positive value instead of overflowing.
/// \param inVal  Input sample.
/// \return |inVal|, or the int16_t maximum when inVal is the int16_t minimum.
static int16_t saturateAbs(int16_t inVal)
{
  // Take the bounds from the parameter's own type rather than from `short`,
  // so the saturation point always matches the value being tested.
  if (inVal == std::numeric_limits<int16_t>::min())
  {
    return std::numeric_limits<int16_t>::max();
  }
  return (int16_t)std::abs(inVal);
}
40 
41 
43 void
44 computeExponent(const BlockFloatCompander::ExpandedData& dataIn, int8_t* expStore)
45 {
46  __m512i maxAbs = __m512i();
47 
49  const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
50  constexpr int k_numRBPerLoop = 4;
51  constexpr int k_numInputLoopIts = BlockFloatCompander::k_numRB / k_numRBPerLoop;
52 
53 #pragma unroll(k_numInputLoopIts)
54  for (int n = 0; n < k_numInputLoopIts; ++n)
55  {
65  constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src
66  constexpr int k_shuff1 = 0x41;
67  const auto z_w1 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 0], k_msk1, rawData[3 * n + 1], rawData[3 * n + 2], k_shuff1);
68 
69  constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src
70  constexpr int k_shuff2 = 0xB1;
71  const auto z_w2 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 1], k_msk2, rawData[3 * n + 0], rawData[3 * n + 2], k_shuff2);
72 
73  constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src
74  constexpr int k_shuff3 = 0xBE;
75  const auto z_w3 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 2], k_msk3, rawData[3 * n + 0], rawData[3 * n + 1], k_shuff3);
76 
78  const auto abs16_1 = _mm512_abs_epi16(z_w1);
79  const auto abs16_2 = _mm512_abs_epi16(z_w2);
80  const auto abs16_3 = _mm512_abs_epi16(z_w3);
81  const auto maxAbs_12 = _mm512_max_epi16(abs16_1, abs16_2);
82  const auto maxAbs_123 = _mm512_max_epi16(maxAbs_12, abs16_3);
83 
86  const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
87  auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbs_123);
88  auto maxAbsHorz = _mm512_max_epi16(maxAbs_123, maxAbsPerm);
89 
91  const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
92  maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
93  maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
94 
96  maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
97  maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);
98 
102  const auto k_select4RB = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
103  28, 24, 20, 16, 28, 24, 20, 16);
104  constexpr uint16_t k_expMsk[k_numInputLoopIts] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
105  maxAbs = _mm512_mask_permutex2var_epi32(maxAbs, k_expMsk[n], k_select4RB, maxAbsHorz);
106  }
107 
109  const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
110  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
111  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
112  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
113  maxAbs = _mm512_and_epi64(maxAbs, k_upperWordMask);
114 
116  const auto totShiftBits = _mm512_set1_epi32(32 - dataIn.iqWidth + 1);
117  const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
118  const auto exponent = _mm512_sub_epi32(totShiftBits, lzCount);
119  constexpr uint16_t k_expWriteMask = 0xFFFF;
120  _mm512_mask_cvtepi32_storeu_epi8(expStore, k_expWriteMask, exponent);
121 }
122 
123 
// Pack 16-bit elements into a 9-bit-per-sample byte stream, independently inside
// each 128-bit lane of the vector.
// compData : vector of 16-bit compressed samples.
// returns  : vector holding the packed bytes at the start of 128-bit lanes 0..2;
//            both byte masks leave lane 3 zeroed. The 9-bit compress kernel in
//            this file stores the low 9 bytes of lanes 0, 1 and 2.
126 __m512i
127 networkBytePack9b(const __m512i compData)
128 {
// Per-element left shift: 7, 6, 5, 4, 3, 2, 1, 0 for the eight 16-bit elements of each lane
130  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000100020003, 0x0004000500060007,
131 0x0000000100020003, 0x0004000500060007,
132 0x0000000100020003, 0x0004000500060007,
133 0x0000000100020003, 0x0004000500060007);
134  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
135 
// First byte shuffle; the zeroing mask keeps output bytes 0..7 of lanes 0..2
137  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x0000000000000000, 0x0C0D080904050001,
138 0x0000000000000000, 0x0C0D080904050001,
139 0x0000000000000000, 0x0C0D080904050001,
140 0x0000000000000000, 0x0C0D080904050001);
141  constexpr uint64_t k_byteMask1 = 0x000000FF00FF00FF;
142  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
143 
// Second byte shuffle; the zeroing mask keeps output bytes 1..8 of lanes 0..2
145  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000000000E, 0x0F0A0B0607020300,
146 0x000000000000000E, 0x0F0A0B0607020300,
147 0x000000000000000E, 0x0F0A0B0607020300,
148 0x000000000000000E, 0x0F0A0B0607020300);
149  constexpr uint64_t k_byteMask2 = 0x000001FE01FE01FE;
150  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
151 
// Bitwise merge: truth table 0xd8 selects, for every bit, compDataShuff2 where the
// k_ternLogSelect bit is 1 and compDataShuff1 where it is 0
153  const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000000000FF, 0x01FC07F01FC07F00,
154 0x00000000000000FF, 0x01FC07F01FC07F00,
155 0x00000000000000FF, 0x01FC07F01FC07F00,
156 0x00000000000000FF, 0x01FC07F01FC07F00);
157  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
158 }
159 
160 
// Pack 16-bit elements into a 10-bit-per-sample byte stream, independently inside
// each 128-bit lane of the vector.
// compData : vector of 16-bit compressed samples.
// returns  : vector holding the packed bytes at the start of 128-bit lanes 0..2;
//            both byte masks leave lane 3 zeroed. The 10-bit compress kernel in
//            this file stores the low 10 bytes of lanes 0, 1 and 2.
163 __m512i
164 networkBytePack10b(const __m512i compData)
165 {
// Per-element left shift: 6, 4, 2, 0 repeating over every four 16-bit elements
167  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
168 0x0000000200040006, 0x0000000200040006,
169 0x0000000200040006, 0x0000000200040006,
170 0x0000000200040006, 0x0000000200040006);
171  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
172 
// First byte shuffle, zero-masked per k_byteMask1
174  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x000000000000000C, 0x0D08090004050001,
175 0x000000000000000C, 0x0D08090004050001,
176 0x000000000000000C, 0x0D08090004050001,
177 0x000000000000000C, 0x0D08090004050001);
178  constexpr uint64_t k_byteMask1 = 0x000001EF01EF01EF;
179  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
180 
// Second byte shuffle, zero-masked per k_byteMask2
182  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x0000000000000E0F, 0x0A0B000607020300,
183 0x0000000000000E0F, 0x0A0B000607020300,
184 0x0000000000000E0F, 0x0A0B000607020300,
185 0x0000000000000E0F, 0x0A0B000607020300);
186  constexpr uint64_t k_byteMask2 = 0x000003DE03DE03DE;
187  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
188 
// Bitwise merge: truth table 0xd8 selects, for every bit, compDataShuff2 where the
// k_ternLogSelect bit is 1 and compDataShuff1 where it is 0
190  const __m512i k_ternLogSelect = _mm512_set_epi64(0x000000000000FF03, 0xF03F00FF03F03F00,
191 0x000000000000FF03, 0xF03F00FF03F03F00,
192 0x000000000000FF03, 0xF03F00FF03F03F00,
193 0x000000000000FF03, 0xF03F00FF03F03F00);
194  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
195 }
196 
197 
// Pack 16-bit elements into a 12-bit-per-sample byte stream, independently inside
// each 128-bit lane of the vector.
// compData : vector of 16-bit compressed samples.
// returns  : vector holding the packed bytes at the start of 128-bit lanes 0..2;
//            both byte masks leave lane 3 zeroed. The 12-bit compress kernel in
//            this file stores the low 12 bytes of lanes 0, 1 and 2.
200 __m512i
201 networkBytePack12b(const __m512i compData)
202 {
// Per-element left shift: 4, 0 alternating over the 16-bit elements
204  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000400000004, 0x0000000400000004,
205 0x0000000400000004, 0x0000000400000004,
206 0x0000000400000004, 0x0000000400000004,
207 0x0000000400000004, 0x0000000400000004);
208  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);
209 
// First byte shuffle, zero-masked per k_byteMask1
211  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x00000000000C0D00, 0x0809000405000001,
212 0x00000000000C0D00, 0x0809000405000001,
213 0x00000000000C0D00, 0x0809000405000001,
214 0x00000000000C0D00, 0x0809000405000001);
215  constexpr uint64_t k_byteMask1 = 0x000006DB06DB06DB;
216  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);
217 
// Second byte shuffle, zero-masked per k_byteMask2
219  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000E0F000A, 0x0B00060700020300,
220 0x000000000E0F000A, 0x0B00060700020300,
221 0x000000000E0F000A, 0x0B00060700020300,
222 0x000000000E0F000A, 0x0B00060700020300);
223  constexpr uint64_t k_byteMask2 = 0x00000DB60DB60DB6;
224  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);
225 
// Bitwise merge: truth table 0xd8 selects, for every bit, compDataShuff2 where the
// k_ternLogSelect bit is 1 and compDataShuff1 where it is 0
227  const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
228 0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
229 0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
230 0x00000000FF0F00FF, 0x0F00FF0F00FF0F00);
231  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
232 }
233 
234 
/// Unpack a 9-bit-per-sample byte stream into 16-bit elements.
///
/// Every returned 16-bit element carries one mantissa in its nine most significant
/// bits with the low seven bits cleared; the 9-bit expand kernel in this file
/// follows up with an arithmetic right shift to sign-extend and apply the exponent.
///
/// \param inData  First packed data byte (the byte after the exponent). No alignment
///                is required. A full 64 bytes are read starting at inData.
/// \return Vector of 16-bit elements as described above.
__m512i
networkByteUnpack9b(const uint8_t* inData)
{
  // Explicit unaligned load: inData is a plain byte pointer (callers in this file
  // pass dataCompressed + expAddr + 1), so dereferencing it through a __m512i*
  // would be undefined behaviour for a misaligned address.
  const __m512i rawDataIn = _mm512_loadu_si512(inData);

  // Select the source 32-bit words per 128-bit lane:
  // lane 0 <- words 0..3, lane 1 <- words 2..5, lane 2 <- words 4..7, lane 3 <- words 12..15
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4,
                                          5, 4, 3, 2, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, rawDataIn);

  // Per-lane byte shuffle that fills each 16-bit element with a pair of source bytes
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0F0E0D0C0B0A0908, 0x0706050403020100,
                                                     0x090A080907080607, 0x0506040503040203,
                                                     0x0809070806070506, 0x0405030402030102,
                                                     0x0708060705060405, 0x0304020301020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  // Per-element left shift of 0, 1, ..., 7 over the eight elements of each lane
  const __m512i k_slBits = _mm512_set_epi64(0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  // Keep only the top 9 bits of every 16-bit element
  const __m512i k_expMask = _mm512_set1_epi16(0xFF80);
  return _mm512_and_epi64(expData, k_expMask);
}
266 
267 
/// Unpack a 10-bit-per-sample byte stream into 16-bit elements.
///
/// Every returned 16-bit element carries one mantissa in its ten most significant
/// bits with the low six bits cleared; the 10-bit expand kernel in this file
/// follows up with an arithmetic right shift to sign-extend and apply the exponent.
///
/// \param inData  First packed data byte (the byte after the exponent). No alignment
///                is required. A full 64 bytes are read starting at inData.
/// \return Vector of 16-bit elements as described above.
__m512i
networkByteUnpack10b(const uint8_t* inData)
{
  // Explicit unaligned load: inData is a plain byte pointer (callers in this file
  // pass dataCompressed + expAddr + 1), so dereferencing it through a __m512i*
  // would be undefined behaviour for a misaligned address.
  const __m512i rawDataIn = _mm512_loadu_si512(inData);

  // Select the source 32-bit words per 128-bit lane:
  // lane 0 <- words 0..3, lane 1 <- words 2..5, lane 2 <- words 5..8, lane 3 <- words 12..15
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 8, 7, 6, 5,
                                          5, 4, 3, 2, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, rawDataIn);

  // Per-lane byte shuffle that fills each 16-bit element with a pair of source bytes
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0809070806070506, 0x0304020301020001,
                                                     0x0809070806070506, 0x0304020301020001,
                                                     0x0A0B090A08090708, 0x0506040503040203,
                                                     0x0809070806070506, 0x0304020301020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  // Per-element left shift of 0, 2, 4, 6 repeating over every four elements
  const __m512i k_slBits = _mm512_set_epi64(0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  // Keep only the top 10 bits of every 16-bit element
  const __m512i k_expMask = _mm512_set1_epi16(0xFFC0);
  return _mm512_and_epi64(expData, k_expMask);
}
299 
300 
/// Unpack a 12-bit-per-sample byte stream into 16-bit elements.
///
/// Every returned 16-bit element carries one mantissa in its twelve most significant
/// bits with the low four bits cleared; the 12-bit expand kernel in this file
/// follows up with an arithmetic right shift to sign-extend and apply the exponent.
///
/// \param inData  First packed data byte (the byte after the exponent). No alignment
///                is required. A full 64 bytes are read starting at inData.
/// \return Vector of 16-bit elements as described above.
__m512i
networkByteUnpack12b(const uint8_t* inData)
{
  // Explicit unaligned load: inData is a plain byte pointer (callers in this file
  // pass dataCompressed + expAddr + 1), so dereferencing it through a __m512i*
  // would be undefined behaviour for a misaligned address.
  const __m512i rawDataIn = _mm512_loadu_si512(inData);

  // Select the source 32-bit words per 128-bit lane:
  // lane 0 <- words 0..3, lane 1 <- words 3..6, lane 2 <- words 6..9, lane 3 <- words 12..15
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 9, 8, 7, 6,
                                          6, 5, 4, 3, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, rawDataIn);

  // Per-lane byte shuffle that fills each 16-bit element with a pair of source bytes
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  // Per-element left shift of 0, 4 alternating over the elements
  const __m512i k_slBits = _mm512_set_epi64(0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  // Keep only the top 12 bits of every 16-bit element
  const __m512i k_expMask = _mm512_set1_epi16(0xFFF0);
  return _mm512_and_epi64(expData, k_expMask);
}
331 
332 
// 8-bit block-floating-point compression of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompress_8b_AVX512(const ExpandedData& dataIn,
// CompressedData* dataOut) - confirm against the original source.
// Output layout per RB: one exponent byte followed by k_numREReal mantissa bytes.
334 void
336 {
// One exponent per RB, computed from dataIn
338  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
339  computeExponent(dataIn, storedExp);
340 
342 #pragma unroll(BlockFloatCompander::k_numRB)
343  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
344  {
// NOTE(review): *rawDataIn is a full 512-bit read through a __m512i pointer whose
// address advances by k_numREReal samples per RB, so it is not 64-byte aligned for
// every n and reads more elements than the 24 that are stored below - consider
// _mm512_loadu_si512 and check the input buffer size.
345  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
// Arithmetic right shift of every sample by this RB's exponent
346  auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
347  auto thisRBExpAddr = n * (BlockFloatCompander::k_numREReal + 1);
// Exponent byte first
349  dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];
// Truncate each 16-bit element to its low byte and store the first 24 bytes
351  constexpr uint32_t k_rbMask = 0x00FFFFFF; // Write mask for 1RB (24 values)
352  _mm256_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_rbMask, _mm512_cvtepi16_epi8(compData));
353  }
354 }
355 
356 
// 9-bit block-floating-point compression of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompress_9b_AVX512(const ExpandedData& dataIn,
// CompressedData* dataOut) - confirm against the original source.
// Output layout per RB: 28 bytes = one exponent byte + 3 x 9 packed data bytes.
358 void
360 {
// One exponent per RB, computed from dataIn
362  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
363  computeExponent(dataIn, storedExp);
364 
367 #pragma unroll(BlockFloatCompander::k_numRB)
368  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
369  {
// NOTE(review): *rawDataIn is a full 512-bit read at an address that is not
// 64-byte aligned for every n - consider _mm512_loadu_si512.
371  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
// Arithmetic right shift of every sample by this RB's exponent
372  auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
373 
// Bit-pack the shifted samples within each 128-bit lane
375  auto compDataBytePacked = networkBytePack9b(compData);
376 
// Exponent byte first
378  constexpr int k_totNumBytesPerRB = 28;
379  auto thisRBExpAddr = n * k_totNumBytesPerRB;
380  dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];
381 
// Store the low 9 bytes of 128-bit lanes 0, 1 and 2 back to back
384  constexpr uint16_t k_RbWriteMask = 0x01FF;
385  constexpr int k_numDataBytesPerLane = 9;
386  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
387  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
388  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
389  }
390 }
391 
392 
// 10-bit block-floating-point compression of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompress_10b_AVX512(const ExpandedData& dataIn,
// CompressedData* dataOut) - confirm against the original source.
// Output layout per RB: 31 bytes = one exponent byte + 3 x 10 packed data bytes.
394 void
396 {
// One exponent per RB, computed from dataIn
398  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
399  computeExponent(dataIn, storedExp);
400 
403 #pragma unroll(BlockFloatCompander::k_numRB)
404  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
405  {
// NOTE(review): *rawDataIn is a full 512-bit read at an address that is not
// 64-byte aligned for every n - consider _mm512_loadu_si512.
407  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
// Arithmetic right shift of every sample by this RB's exponent
408  auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
409 
// Bit-pack the shifted samples within each 128-bit lane
411  auto compDataBytePacked = networkBytePack10b(compData);
412 
// Exponent byte first
414  constexpr int k_totNumBytesPerRB = 31;
415  auto thisRBExpAddr = n * k_totNumBytesPerRB;
416  dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];
417 
// Store the low 10 bytes of 128-bit lanes 0, 1 and 2 back to back
420  constexpr uint16_t k_RbWriteMask = 0x03FF;
421  constexpr int k_numDataBytesPerLane = 10;
422  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
423  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
424  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
425  }
426 }
427 
428 
// 12-bit block-floating-point compression of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompress_12b_AVX512(const ExpandedData& dataIn,
// CompressedData* dataOut) - confirm against the original source.
// Output layout per RB: 37 bytes = one exponent byte + 3 x 12 packed data bytes.
430 void
432 {
// One exponent per RB, computed from dataIn
434  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
435  computeExponent(dataIn, storedExp);
436 
439 #pragma unroll(BlockFloatCompander::k_numRB)
440  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
441  {
// NOTE(review): *rawDataIn is a full 512-bit read at an address that is not
// 64-byte aligned for every n - consider _mm512_loadu_si512.
443  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
// Arithmetic right shift of every sample by this RB's exponent
444  auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
445 
// Bit-pack the shifted samples within each 128-bit lane
447  auto compDataBytePacked = networkBytePack12b(compData);
448 
// Exponent byte first
450  constexpr int k_totNumBytesPerRB = 37;
451  auto thisRBExpAddr = n * k_totNumBytesPerRB;
452  dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];
453 
// Store the low 12 bytes of 128-bit lanes 0, 1 and 2 back to back
456  constexpr uint16_t k_RbWriteMask = 0x0FFF;
457  constexpr int k_numDataBytesPerLane = 12;
458  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
459  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
460  _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
461  }
462 }
463 
464 
// 8-bit block-floating-point expansion of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatExpand_8b_AVX512(const CompressedData& dataIn,
// ExpandedData* dataOut) - confirm against the original source.
// Input layout per RB: one exponent byte followed by k_numREReal mantissa bytes.
466 void
468 {
469 #pragma unroll(BlockFloatCompander::k_numRB)
470  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
471  {
473  auto expAddr = n * (BlockFloatCompander::k_numREReal + 1);
// NOTE(review): *rawDataIn reads 32 bytes through a __m256i pointer that starts one
// byte past the exponent, so it is generally misaligned - consider _mm256_loadu_si256.
474  const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(dataIn.dataCompressed + expAddr + 1);
// Sign-extend every byte to 16 bits, then shift left by the exponent byte
475  const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
476  const auto expData = _mm512_slli_epi16(compData16, *(dataIn.dataCompressed + expAddr));
// Store only the first six 64-bit elements (24 int16 samples)
478  constexpr uint8_t k_rbMask64 = 0b00111111; // 64b write mask for 1RB (24 int16 values)
479  _mm512_mask_storeu_epi64(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_rbMask64, expData);
480  }
481 }
482 
483 
// 9-bit block-floating-point expansion of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatExpand_9b_AVX512(const CompressedData& dataIn,
// ExpandedData* dataOut) - confirm against the original source.
// Input layout per RB: 28 bytes = one exponent byte + 27 packed data bytes.
485 void
487 {
488 #pragma unroll(BlockFloatCompander::k_numRB)
489  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
490  {
491  constexpr int k_totNumBytesPerRB = 28;
492  auto expAddr = n * k_totNumBytesPerRB;
493 
// Unpack the data bytes that follow the exponent; mantissas come back in the top
// 9 bits of each 16-bit element
495  auto expData = networkByteUnpack9b(dataIn.dataCompressed + expAddr + 1);
496 
// Arithmetic right shift by (7 - exponent): sign-extends and applies the exponent.
// NOTE(review): an exponent byte above 7 makes the shift count negative - confirm
// the compressor can never emit one.
498  constexpr int k_maxExpShift = 7;
499  expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));
500 
// Store the first 24 16-bit samples of the vector
502  static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
503  _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
504  }
505 }
506 
507 
// 10-bit block-floating-point expansion of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatExpand_10b_AVX512(const CompressedData& dataIn,
// ExpandedData* dataOut) - confirm against the original source.
// Input layout per RB: 31 bytes = one exponent byte + 30 packed data bytes.
509 void
511 {
512 #pragma unroll(BlockFloatCompander::k_numRB)
513  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
514  {
515  constexpr int k_totNumBytesPerRB = 31;
516  auto expAddr = n * k_totNumBytesPerRB;
517 
// Unpack the data bytes that follow the exponent; mantissas come back in the top
// 10 bits of each 16-bit element
519  auto expData = networkByteUnpack10b(dataIn.dataCompressed + expAddr + 1);
520 
// Arithmetic right shift by (6 - exponent): sign-extends and applies the exponent.
// NOTE(review): an exponent byte above 6 makes the shift count negative - confirm
// the compressor can never emit one.
522  constexpr int k_maxExpShift = 6;
523  expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));
524 
// Store the first 24 16-bit samples of the vector
526  static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
527  _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
528  }
529 }
530 
531 
// 12-bit block-floating-point expansion of k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatExpand_12b_AVX512(const CompressedData& dataIn,
// ExpandedData* dataOut) - confirm against the original source.
// Input layout per RB: 37 bytes = one exponent byte + 36 packed data bytes.
533 void
535 {
536 #pragma unroll(BlockFloatCompander::k_numRB)
537  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
538  {
539  constexpr int k_totNumBytesPerRB = 37;
540  auto expAddr = n * k_totNumBytesPerRB;
541 
// Unpack the data bytes that follow the exponent; mantissas come back in the top
// 12 bits of each 16-bit element
543  auto expData = networkByteUnpack12b(dataIn.dataCompressed + expAddr + 1);
544 
// Arithmetic right shift by (4 - exponent): sign-extends and applies the exponent.
// NOTE(review): an exponent byte above 4 makes the shift count negative - confirm
// the compressor can never emit one.
546  constexpr int k_maxExpShift = 4;
547  expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));
548 
// Store the first 24 16-bit samples of the vector
550  static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
551  _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
552  }
553 }
554 
555 
// Scalar block-floating-point compression for any iqWidth.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompander::BlockFloatCompress_Basic(const
// ExpandedData& dataIn, CompressedData* dataOut) - confirm against the original source.
// For every RB it writes one exponent byte followed by the mantissas of all
// k_numREReal samples, iqWidth bits each, most significant bit first.
557 void
559 {
560  int dataOutIdx = 0;
// Mask with the low iqWidth bits set
561  int16_t iqMask = (int16_t)((1 << dataIn.iqWidth) - 1);
562  int byteShiftUnits = dataIn.iqWidth - 8;
563 
564  for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
565  {
// Largest saturated absolute value in this RB
567  int16_t maxAbs = 0;
568  for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
569  {
570  auto dataIdx = rb * BlockFloatCompander::k_numREReal + re;
571  auto dataAbs = saturateAbs(dataIn.dataExpanded[dataIdx]);
572  maxAbs = std::max(maxAbs, dataAbs);
573  }
574 
575  // Find exponent and insert into byte stream
576  auto thisExp = (uint8_t)(std::max(0,(16 - dataIn.iqWidth + 1 - __lzcnt16(maxAbs))));
577  dataOut->dataCompressed[dataOutIdx++] = thisExp;
578 
// byteShiftVal + 8 is the number of bits in byteBuffer not yet written out.
// NOTE(review): byteBuffer is a signed int that is shifted left for every sample
// and never cleared of bits already written, so the shift eventually overflows
// int - consider an unsigned buffer.
582  static constexpr int k_byteMask = 0xFF;
583  int byteShiftVal = -8;
584  int byteBuffer = { 0 };
585  for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
586  {
587  auto dataIdxIn = rb * BlockFloatCompander::k_numREReal + re;
// Scale by the exponent and append the low iqWidth bits to the bit buffer
588  auto thisRE = dataIn.dataExpanded[dataIdxIn] >> thisExp;
589  byteBuffer = (byteBuffer << dataIn.iqWidth) + (int)(thisRE & iqMask);
590 
// 8 + byteShiftUnits equals iqWidth: that many new bits are now pending
591  byteShiftVal += (8 + byteShiftUnits);
// Write out every complete pending byte
592  while (byteShiftVal >= 0)
593  {
594  auto thisByte = (uint8_t)((byteBuffer >> byteShiftVal) & k_byteMask);
595  dataOut->dataCompressed[dataOutIdx++] = thisByte;
596  byteShiftVal -= 8;
597  }
598  }
599  }
600  dataOut->iqWidth = dataIn.iqWidth;
601 }
602 
// Scalar block-floating-point expansion for any iqWidth.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompander::BlockFloatExpand_Basic(const
// CompressedData& dataIn, ExpandedData* dataOut) - confirm against the original source.
// Each RB occupies 3 * iqWidth + 1 input bytes: one exponent byte followed by the
// packed mantissas.
604 void
606 {
// Mask with the top iqWidth bits of a 32-bit word set
607  uint32_t iqMask = (uint32_t)(UINT_MAX - ((1 << (32 - dataIn.iqWidth)) - 1));
608  uint32_t byteBuffer = { 0 };
609  int numBytesPerRB = (3 * dataIn.iqWidth) + 1;
// Number of bits in byteBuffer that have not been consumed yet
610  int bitPointer = 0;
611  int dataIdxOut = 0;
612 
613  for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
614  {
615  auto expIdx = rb * numBytesPerRB;
// Right shift that moves a sample from the top of a 32-bit word down to its final
// position, with the exponent already applied
616  auto signExtShift = 32 - dataIn.iqWidth - dataIn.dataCompressed[expIdx];
617 
618  for (int b = 0; b < numBytesPerRB - 1; ++b)
619  {
620  auto dataIdxIn = (expIdx + 1) + b;
621  auto thisByte = (uint16_t)dataIn.dataCompressed[dataIdxIn];
622  byteBuffer = (uint32_t)((byteBuffer << 8) + thisByte);
623  bitPointer += 8;
// Extract every complete sample currently held in the buffer
624  while (bitPointer >= dataIn.iqWidth)
625  {
// Left-align the oldest pending bits, keep iqWidth of them, then shift right
// arithmetically to sign-extend
629  int32_t thisSample32 = (int32_t)((byteBuffer << (32 - bitPointer)) & iqMask);
630  int16_t thisSample = (int16_t)(thisSample32 >> signExtShift);
631  bitPointer -= dataIn.iqWidth;
632  dataOut->dataExpanded[dataIdxOut++] = thisSample;
633  }
634  }
635  }
636 }
637 
// Scalar block-floating-point compression using the BlockFloatCompanderBFW
// dimensions (k_numRB, k_numREReal).
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompanderBFW::BlockFloatCompress_Basic(const
// ExpandedData& dataIn, CompressedData* dataOut) - confirm against the original source.
// For every RB it writes one exponent byte followed by the mantissas of all
// k_numREReal samples, iqWidth bits each, most significant bit first.
639 void
641 {
642  int dataOutIdx = 0;
// Mask with the low iqWidth bits set
643  int16_t iqMask = (int16_t)((1 << dataIn.iqWidth) - 1);
644  int byteShiftUnits = dataIn.iqWidth - 8;
645 
646  for (int rb = 0; rb < BlockFloatCompanderBFW::k_numRB; ++rb)
647  {
// Largest saturated absolute value in this RB
649  int16_t maxAbs = 0;
650  for (int re = 0; re < BlockFloatCompanderBFW::k_numREReal; ++re)
651  {
652  auto dataIdx = rb * BlockFloatCompanderBFW::k_numREReal + re;
653  auto dataAbs = saturateAbs(dataIn.dataExpanded[dataIdx]);
654  maxAbs = std::max(maxAbs, dataAbs);
655  }
656 
657  // Find exponent and insert into byte stream
658  auto thisExp = (uint8_t)(std::max(0,(16 - dataIn.iqWidth + 1 - __lzcnt16(maxAbs))));
659  dataOut->dataCompressed[dataOutIdx++] = thisExp;
660 
// byteShiftVal + 8 is the number of bits in byteBuffer not yet written out.
// NOTE(review): byteBuffer is a signed int that is shifted left for every sample
// and never cleared of bits already written, so the shift eventually overflows
// int - consider an unsigned buffer.
664  static constexpr int k_byteMask = 0xFF;
665  int byteShiftVal = -8;
666  int byteBuffer = { 0 };
667  for (int re = 0; re < BlockFloatCompanderBFW::k_numREReal; ++re)
668  {
669  auto dataIdxIn = rb * BlockFloatCompanderBFW::k_numREReal + re;
// Scale by the exponent and append the low iqWidth bits to the bit buffer
670  auto thisRE = dataIn.dataExpanded[dataIdxIn] >> thisExp;
671  byteBuffer = (byteBuffer << dataIn.iqWidth) + (int)(thisRE & iqMask);
672 
// 8 + byteShiftUnits equals iqWidth: that many new bits are now pending
673  byteShiftVal += (8 + byteShiftUnits);
// Write out every complete pending byte
674  while (byteShiftVal >= 0)
675  {
676  auto thisByte = (uint8_t)((byteBuffer >> byteShiftVal) & k_byteMask);
677  dataOut->dataCompressed[dataOutIdx++] = thisByte;
678  byteShiftVal -= 8;
679  }
680  }
681  }
682  dataOut->iqWidth = dataIn.iqWidth;
683 }
684 
// Scalar block-floating-point expansion iterating over BlockFloatCompanderBFW::k_numRB RBs.
// NOTE(review): the listing line with the function name and parameters (between
// `void` and `{`) is missing here. Judging by the index at the end of the listing
// this is presumably BlockFloatCompanderBFW::BlockFloatExpand_Basic(const
// CompressedData& dataIn, ExpandedData* dataOut) - confirm against the original source.
// NOTE(review): the per-RB stride is hard-coded as 3 * iqWidth + 1 bytes, i.e. 24
// samples per RB - confirm this matches BlockFloatCompanderBFW::k_numREReal, which
// the matching compressor uses.
686 void
688 {
// Mask with the top iqWidth bits of a 32-bit word set
689  uint32_t iqMask = (uint32_t)(UINT_MAX - ((1 << (32 - dataIn.iqWidth)) - 1));
690  uint32_t byteBuffer = { 0 };
691  int numBytesPerRB = (3 * dataIn.iqWidth) + 1;
// Number of bits in byteBuffer that have not been consumed yet
692  int bitPointer = 0;
693  int dataIdxOut = 0;
694 
695  for (int rb = 0; rb < BlockFloatCompanderBFW::k_numRB; ++rb)
696  {
697  auto expIdx = rb * numBytesPerRB;
// Right shift that moves a sample from the top of a 32-bit word down to its final
// position, with the exponent already applied
698  auto signExtShift = 32 - dataIn.iqWidth - dataIn.dataCompressed[expIdx];
699 
700  for (int b = 0; b < numBytesPerRB - 1; ++b)
701  {
702  auto dataIdxIn = (expIdx + 1) + b;
703  auto thisByte = (uint16_t)dataIn.dataCompressed[dataIdxIn];
704  byteBuffer = (uint32_t)((byteBuffer << 8) + thisByte);
705  bitPointer += 8;
// Extract every complete sample currently held in the buffer
706  while (bitPointer >= dataIn.iqWidth)
707  {
// Left-align the oldest pending bits, keep iqWidth of them, then shift right
// arithmetically to sign-extend
711  int32_t thisSample32 = (int32_t)((byteBuffer << (32 - bitPointer)) & iqMask);
712  int16_t thisSample = (int16_t)(thisSample32 >> signExtShift);
713  bitPointer -= dataIn.iqWidth;
714  dataOut->dataExpanded[dataIdxOut++] = thisSample;
715  }
716  }
717  }
718 }
719 
/// Round an RB count up to the next multiple of BlockFloatCompander::k_numRB.
/// The argument is parenthesized so that expression arguments (for example a
/// conditional or bitwise expression) are evaluated as a unit before the addition.
#define RB_NUM_ROUNDUP(rb) \
  (BlockFloatCompander::k_numRB * (((rb) + BlockFloatCompander::k_numRB - 1) / BlockFloatCompander::k_numRB))
722 
723 
727 
// External entry point: block-floating-point compression of request->numRBs RBs,
// processed in blocks of BlockFloatCompander::k_numRB RBs.
// NOTE(review): the listing line with the function name and first parameter is
// missing after `int32_t`. Judging by the index at the end of the listing this is
// presumably xranlib_compress_avx512(const struct xranlib_compress_request *request,
// struct xranlib_compress_response *response) - confirm against the original source.
// Returns 0; response->len is set to (3 * iqWidth + 1) * numRBs.
728 int32_t
730  struct xranlib_compress_response *response)
731 {
732  BlockFloatCompander::ExpandedData expandedDataInput;
733  BlockFloatCompander::CompressedData compressedDataOut;
734  xran_bfp_compress_fn com_fn = NULL;
735  int16_t numRBs = request->numRBs;
// Running offset into response->data_out
736  int16_t len = 0;
737 
// NOTE(review): in this listing no case assigns com_fn (one listing line is absent
// before every `break`), so as shown it would still be NULL when called below.
// Presumably the missing lines select the kernel for each width - confirm against
// the original source.
738  switch (request->iqWidth){
739  case 8:
740  expandedDataInput.iqWidth = 8;
742  break;
743  case 9:
744  expandedDataInput.iqWidth = 9;
746  break;
747  case 10:
748  expandedDataInput.iqWidth = 10;
750  break;
751  case 12:
752  expandedDataInput.iqWidth = 12;
754  break;
755  default:
756  expandedDataInput.iqWidth = request->iqWidth;
758  break;
759  }
760 
// One iteration per block of k_numRB RBs; numRBs is rounded up, so a partial last
// block is still processed as a whole block.
// NOTE(review): input and output buffers therefore need room for the rounded-up
// RB count - confirm callers allocate accordingly.
761  for (int16_t block_idx = 0;
762  block_idx < RB_NUM_ROUNDUP(numRBs)/BlockFloatCompander::k_numRB /*+ 1*/; /* 16 RBs at time */
763  block_idx++) {
764 
765  expandedDataInput.dataExpanded =
766  &request->data_in[block_idx*BlockFloatCompander::k_numSampsExpanded];
767  compressedDataOut.dataCompressed =
768  (uint8_t*)&response->data_out[len];
769 
770  com_fn(expandedDataInput, &compressedDataOut);
// Advance by the compressed size of min(k_numRB, numRBs) RBs; numRBs itself is not
// reduced between iterations
771  len += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_numRB,(int16_t)numRBs);
772  }
773 
774  response->len = ((3 * expandedDataInput.iqWidth) + 1) * numRBs;
775 
776  return 0;
777 }
778 
781 
// External entry point: block-floating-point compression using the
// BlockFloatCompanderBFW types, done in a single kernel call.
// NOTE(review): the listing line with the function name and first parameter is
// missing after `int32_t`. Judging by the index at the end of the listing this is
// presumably xranlib_compress_avx512_bfw(const struct xranlib_compress_request
// *request, struct xranlib_compress_response *response) - confirm against the
// original source.
// Returns 0 on success and -1 for an iqWidth other than 8, 9, 10 or 12.
782 int32_t
784  struct xranlib_compress_response *response)
785 {
786  BlockFloatCompanderBFW::ExpandedData expandedDataInput;
787  BlockFloatCompanderBFW::CompressedData compressedDataKern;
788  xran_bfp_compress_bfw_fn com_fn = NULL;
789 
// Disabled element-by-element copy of the input, kept for reference
790 #if 0
791  for (int m = 0; m < BlockFloatCompander::k_numRB; ++m){
792  for (int n = 0; n < BlockFloatCompander::k_numREReal; ++n){
793  expandedDataInput.dataExpanded[m*BlockFloatCompander::k_numREReal+n] =
794  request->data_in[m*BlockFloatCompander::k_numREReal+n];
795  }
796  }
797 #endif
798 
// The kernel reads from and writes to the caller's buffers directly
799  expandedDataInput.dataExpanded = request->data_in;
800  compressedDataKern.dataCompressed = (uint8_t*)response->data_out;
801 
// NOTE(review): no visible line assigns com_fn (listing line 802 is absent and the
// only assignment inside the switch is commented out), so as shown it would still
// be NULL when called below. Presumably the missing line selects the kernel -
// confirm against the original source.
803  switch (request->iqWidth){
804  case 8:
805  expandedDataInput.iqWidth = 8;
806  break;
807  case 9:
808  expandedDataInput.iqWidth = 9;
809  //com_fn = BlockFloatCompanderBFW::BlockFloatExpand_9b_AVX512
810  break;
811  case 10:
812  expandedDataInput.iqWidth = 10;
813  break;
814  case 12:
815  expandedDataInput.iqWidth = 12;
816  break;
817  default:
818  printf("bfwIqWidth is not supported %d\n", request->iqWidth);
819  return -1;
820  break;
821  }
822 
823  com_fn(expandedDataInput, &compressedDataKern);
// Reported length: (k_numRE / 16 * 4 * iqWidth + 1) per RB, times k_numRB
824  response->len = ((BlockFloatCompanderBFW::k_numRE/16*4*expandedDataInput.iqWidth)+1)*BlockFloatCompanderBFW::k_numRB;
825 
826  return 0;
827 }
828 
831 
832 
// External entry point: block-floating-point expansion of request->numRBs RBs,
// processed in blocks of BlockFloatCompander::k_numRB RBs.
// NOTE(review): the listing line with the function name and first parameter is
// missing after `int32_t`. Judging by the index at the end of the listing this is
// presumably xranlib_decompress_avx512(const struct xranlib_decompress_request
// *request, struct xranlib_decompress_response *response) - confirm against the
// original source.
// Returns 0; response->len is set to numRBs * k_numREReal * sizeof(int16_t).
833 int32_t
835  struct xranlib_decompress_response *response)
836 {
837 
838  BlockFloatCompander::CompressedData compressedDataInput;
839  BlockFloatCompander::ExpandedData expandedDataOut;
840 
841  xran_bfp_decompress_fn decom_fn = NULL;
842  int16_t numRBs = request->numRBs;
// Running offset into response->data_out
843  int16_t len = 0;
844 
// NOTE(review): in this listing no case assigns decom_fn (one listing line is
// absent before every `break`), so as shown it would still be NULL when called
// below. Presumably the missing lines select the kernel for each width - confirm
// against the original source.
845  switch (request->iqWidth){
846  case 8:
847  compressedDataInput.iqWidth = 8;
849  break;
850  case 9:
851  compressedDataInput.iqWidth = 9;
853  break;
854  case 10:
855  compressedDataInput.iqWidth = 10;
857  break;
858  case 12:
859  compressedDataInput.iqWidth = 12;
861  break;
862  default:
863  compressedDataInput.iqWidth = request->iqWidth;
865  break;
866  }
867 
// One iteration per block of k_numRB RBs; numRBs is rounded up, so a partial last
// block is still processed as a whole block.
// NOTE(review): input and output buffers therefore need room for the rounded-up
// RB count - confirm callers allocate accordingly.
868  for (int16_t block_idx = 0;
869  block_idx < RB_NUM_ROUNDUP(numRBs)/BlockFloatCompander::k_numRB;
870  block_idx++) {
871 
// Each input block spans (3 * iqWidth + 1) * k_numRB elements of request->data_in
872  compressedDataInput.dataCompressed = (uint8_t*)&request->data_in[block_idx*(((3 * compressedDataInput.iqWidth ) + 1) * BlockFloatCompander::k_numRB)];
873  expandedDataOut.dataExpanded = &response->data_out[len];
874 
875  decom_fn(compressedDataInput, &expandedDataOut);
// Advance by min(k_numSampsExpanded, numRBs * k_numREReal) samples; numRBs itself
// is not reduced between iterations
876  len += std::min((int16_t)BlockFloatCompander::k_numSampsExpanded, (int16_t)(numRBs*BlockFloatCompander::k_numREReal));
877  }
878 
879  response->len = numRBs * BlockFloatCompander::k_numREReal* sizeof(int16_t);
880 
881  return 0;
882 }
void BlockFloatCompress_12b_AVX512(const ExpandedData &dataIn, CompressedData *dataOut)
12 bit compression
uint32_t rb
Definition: xran_pkt_cp.h:243
void(* xran_bfp_compress_bfw_fn)(const BlockFloatCompanderBFW::ExpandedData &dataIn, BlockFloatCompanderBFW::CompressedData *dataOut)
__m512i networkBytePack10b(const __m512i compData)
#define RB_NUM_ROUNDUP(rb)
int iqWidth
Size of mantissa including sign bit.
int iqWidth
Size of mantissa including sign bit.
Request structure containing pointer to data and its length.
void BlockFloatCompress_Basic(const ExpandedData &dataIn, CompressedData *dataOut)
Reference compression.
External API for companding using the BFP algorithm.
numRBs
Definition: gen_test.m:96
__m512i networkByteUnpack10b(const uint8_t *inData)
int iqWidth
Size of mantissa including sign bit.
void(* xran_bfp_compress_fn)(const BlockFloatCompander::ExpandedData &dataIn, BlockFloatCompander::CompressedData *dataOut)
void BlockFloatCompress_9b_AVX512(const ExpandedData &dataIn, CompressedData *dataOut)
9 bit compression
void BlockFloatCompress_10b_AVX512(const ExpandedData &dataIn, CompressedData *dataOut)
10 bit compression
void(* xran_bfp_decompress_fn)(const BlockFloatCompander::CompressedData &dataIn, BlockFloatCompander::ExpandedData *dataOut)
int32_t xranlib_compress_avx512(const struct xranlib_compress_request *request, struct xranlib_compress_response *response)
int iqWidth
Size of mantissa including sign bit.
void BlockFloatExpand_8b_AVX512(const CompressedData &dataIn, ExpandedData *dataOut)
8 bit expansion
__m512i networkByteUnpack12b(const uint8_t *inData)
Request structure containing pointer to data and its length.
int32_t xranlib_decompress_avx512(const struct xranlib_decompress_request *request, struct xranlib_decompress_response *response)
__m512i networkByteUnpack9b(const uint8_t *inData)
void BlockFloatExpand_Basic(const CompressedData &dataIn, ExpandedData *dataOut)
Reference expansion.
void BlockFloatExpand_9b_AVX512(const CompressedData &dataIn, ExpandedData *dataOut)
9 bit expansion
void computeExponent(const BlockFloatCompander::ExpandedData &dataIn, int8_t *expStore)
Compute exponent value for a set of RB from the maximum absolute value.
switch(bw) case
Definition: gen_test.m:94
Response structure containing pointer to data and its length.
void BlockFloatExpand_12b_AVX512(const CompressedData &dataIn, ExpandedData *dataOut)
12 bit expansion
CACHE_ALIGNED uint8_t * dataCompressed
void BlockFloatExpand_10b_AVX512(const CompressedData &dataIn, ExpandedData *dataOut)
10 bit expansion
__m512i networkBytePack9b(const __m512i compData)
void BlockFloatCompress_8b_AVX512(const ExpandedData &dataIn, CompressedData *dataOut)
8 bit compression
CACHE_ALIGNED int16_t * dataExpanded
__m512i networkBytePack12b(const __m512i compData)
int32_t xranlib_compress_avx512_bfw(const struct xranlib_compress_request *request, struct xranlib_compress_response *response)
void BlockFloatCompress_Basic(const ExpandedData &dataIn, CompressedData *dataOut)
Reference compression.
Response structure containing pointer to data and its length.
void BlockFloatExpand_Basic(const CompressedData &dataIn, ExpandedData *dataOut)
Reference expansion.