/******************************************************************************
*
* Copyright (c) 2019 Intel.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*******************************************************************************/

#include "xran_compression.hpp"
#include "xran_compression.h"

#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <immintrin.h>

/// Compute absolute value with saturation, so that INT16_MIN does not overflow
static int16_t saturateAbs(int16_t inVal)
{
  int16_t result;
  if (inVal == std::numeric_limits<short>::min())
  {
    result = std::numeric_limits<short>::max();
  }
  else
  {
    result = (int16_t)std::abs(inVal);
  }
  return result;
}

/// Compute exponent value for a set of RB from the maximum absolute value
static void
computeExponent(const BlockFloatCompander::ExpandedData& dataIn, int8_t* expStore)
{
  __m512i maxAbs = __m512i();

  /// Load data and find max(abs(RB))
  const __m512i* rawData = reinterpret_cast<const __m512i*>(dataIn.dataExpanded);
  constexpr int k_numRBPerLoop = 4;
  constexpr int k_numInputLoopIts = BlockFloatCompander::k_numRB / k_numRBPerLoop;

#pragma unroll(k_numInputLoopIts)
  for (int n = 0; n < k_numInputLoopIts; ++n)
  {
    /// Re-order the next 4RB in input data into 3 registers
    /// Input SIMD vectors are:
    ///   [A A A A A A A A A A A A B B B B]
    ///   [B B B B B B B B C C C C C C C C]
    ///   [C C C C D D D D D D D D D D D D]
    /// Re-ordered SIMD vectors are:
    ///   [A A A A B B B B C C C C D D D D]
    ///   [A A A A B B B B C C C C D D D D]
    ///   [A A A A B B B B C C C C D D D D]
    constexpr uint8_t k_msk1 = 0b11111100; // Copy first lane of src
    constexpr int k_shuff1 = 0x41;
    const auto z_w1 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 0], k_msk1, rawData[3 * n + 1], rawData[3 * n + 2], k_shuff1);

    constexpr uint8_t k_msk2 = 0b11000011; // Copy middle two lanes of src
    constexpr int k_shuff2 = 0xB1;
    const auto z_w2 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 1], k_msk2, rawData[3 * n + 0], rawData[3 * n + 2], k_shuff2);

    constexpr uint8_t k_msk3 = 0b00111111; // Copy last lane of src
    constexpr int k_shuff3 = 0xBE;
    const auto z_w3 = _mm512_mask_shuffle_i64x2(rawData[3 * n + 2], k_msk3, rawData[3 * n + 0], rawData[3 * n + 1], k_shuff3);

    /// Perform max abs on these 3 registers
    const auto abs16_1 = _mm512_abs_epi16(z_w1);
    const auto abs16_2 = _mm512_abs_epi16(z_w2);
    const auto abs16_3 = _mm512_abs_epi16(z_w3);
    const auto maxAbs_12 = _mm512_max_epi16(abs16_1, abs16_2);
    const auto maxAbs_123 = _mm512_max_epi16(maxAbs_12, abs16_3);

    /// Perform horizontal max over each lane
    /// Swap 64b in each lane and compute max
    const auto k_perm64b = _mm512_set_epi64(6, 7, 4, 5, 2, 3, 0, 1);
    auto maxAbsPerm = _mm512_permutexvar_epi64(k_perm64b, maxAbs_123);
    auto maxAbsHorz = _mm512_max_epi16(maxAbs_123, maxAbsPerm);

    /// Swap each pair of 32b in each lane and compute max
    const auto k_perm32b = _mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    maxAbsPerm = _mm512_permutexvar_epi32(k_perm32b, maxAbsHorz);
    maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);

    /// Swap each IQ pair in each lane (via 32b rotation) and compute max
    maxAbsPerm = _mm512_rol_epi32(maxAbsHorz, BlockFloatCompander::k_numBitsIQ);
    maxAbsHorz = _mm512_max_epi16(maxAbsHorz, maxAbsPerm);

    /// Insert values into maxAbs
    /// Use sliding mask to insert wanted values into maxAbs
    /// Pairs of values will be inserted and corrected outside of loop
    const auto k_select4RB = _mm512_set_epi32(28, 24, 20, 16, 28, 24, 20, 16,
                                              28, 24, 20, 16, 28, 24, 20, 16);
    constexpr uint16_t k_expMsk[k_numInputLoopIts] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
    maxAbs = _mm512_mask_permutex2var_epi32(maxAbs, k_expMsk[n], k_select4RB, maxAbsHorz);
  }

  /// Convert to 32b by removing repeated values in maxAbs
  const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
  maxAbs = _mm512_and_epi64(maxAbs, k_upperWordMask);

  /// Compute and store exponent
  const auto totShiftBits = _mm512_set1_epi32(32 - dataIn.iqWidth + 1);
  const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
  const auto exponent = _mm512_sub_epi32(totShiftBits, lzCount);
  constexpr uint16_t k_expWriteMask = 0xFFFF;
  _mm512_mask_cvtepi32_storeu_epi8(expStore, k_expWriteMask, exponent);
}
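
/// Worked example of the exponent formula above (for reference): for iqWidth = 9 and an
/// RB whose max(abs(sample)) is 1000 (22 leading zeros as a 32-bit value), the stored
/// exponent is (32 - 9 + 1) - 22 = 2, i.e. the samples are arithmetically shifted right
/// by 2 before being truncated to 9 bits by the packing routines below.
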
/// Pack compressed 9 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkBytePack9b(const __m512i compData)
{
  /// Logical shift left to align network order byte parts
  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000100020003, 0x0004000500060007,
                                               0x0000000100020003, 0x0004000500060007,
                                               0x0000000100020003, 0x0004000500060007,
                                               0x0000000100020003, 0x0004000500060007);
  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

  /// First epi8 shuffle of even indexed samples
  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x0000000000000000, 0x0C0D080904050001,
                                                      0x0000000000000000, 0x0C0D080904050001,
                                                      0x0000000000000000, 0x0C0D080904050001,
                                                      0x0000000000000000, 0x0C0D080904050001);
  constexpr uint64_t k_byteMask1 = 0x000000FF00FF00FF;
  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

  /// Second epi8 shuffle of odd indexed samples
  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000000000E, 0x0F0A0B0607020300,
                                                      0x000000000000000E, 0x0F0A0B0607020300,
                                                      0x000000000000000E, 0x0F0A0B0607020300,
                                                      0x000000000000000E, 0x0F0A0B0607020300);
  constexpr uint64_t k_byteMask2 = 0x000001FE01FE01FE;
  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

  /// Ternary blend of the two shuffled results
  const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000000000FF, 0x01FC07F01FC07F00,
                                                   0x00000000000000FF, 0x01FC07F01FC07F00,
                                                   0x00000000000000FF, 0x01FC07F01FC07F00,
                                                   0x00000000000000FF, 0x01FC07F01FC07F00);
  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
}

/// Pack compressed 10 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkBytePack10b(const __m512i compData)
{
  /// Logical shift left to align network order byte parts
  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
                                               0x0000000200040006, 0x0000000200040006,
                                               0x0000000200040006, 0x0000000200040006,
                                               0x0000000200040006, 0x0000000200040006);
  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

  /// First epi8 shuffle of even indexed samples
  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x000000000000000C, 0x0D08090004050001,
                                                      0x000000000000000C, 0x0D08090004050001,
                                                      0x000000000000000C, 0x0D08090004050001,
                                                      0x000000000000000C, 0x0D08090004050001);
  constexpr uint64_t k_byteMask1 = 0x000001EF01EF01EF;
  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

  /// Second epi8 shuffle of odd indexed samples
  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x0000000000000E0F, 0x0A0B000607020300,
                                                      0x0000000000000E0F, 0x0A0B000607020300,
                                                      0x0000000000000E0F, 0x0A0B000607020300,
                                                      0x0000000000000E0F, 0x0A0B000607020300);
  constexpr uint64_t k_byteMask2 = 0x000003DE03DE03DE;
  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

  /// Ternary blend of the two shuffled results
  const __m512i k_ternLogSelect = _mm512_set_epi64(0x000000000000FF03, 0xF03F00FF03F03F00,
                                                   0x000000000000FF03, 0xF03F00FF03F03F00,
                                                   0x000000000000FF03, 0xF03F00FF03F03F00,
                                                   0x000000000000FF03, 0xF03F00FF03F03F00);
  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
}

/// Pack compressed 12 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkBytePack12b(const __m512i compData)
{
  /// Logical shift left to align network order byte parts
  const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000400000004, 0x0000000400000004,
                                               0x0000000400000004, 0x0000000400000004,
                                               0x0000000400000004, 0x0000000400000004,
                                               0x0000000400000004, 0x0000000400000004);
  auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

  /// First epi8 shuffle of even indexed samples
  const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x00000000000C0D00, 0x0809000405000001,
                                                      0x00000000000C0D00, 0x0809000405000001,
                                                      0x00000000000C0D00, 0x0809000405000001,
                                                      0x00000000000C0D00, 0x0809000405000001);
  constexpr uint64_t k_byteMask1 = 0x000006DB06DB06DB;
  auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

  /// Second epi8 shuffle of odd indexed samples
  const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000E0F000A, 0x0B00060700020300,
                                                      0x000000000E0F000A, 0x0B00060700020300,
                                                      0x000000000E0F000A, 0x0B00060700020300,
                                                      0x000000000E0F000A, 0x0B00060700020300);
  constexpr uint64_t k_byteMask2 = 0x00000DB60DB60DB6;
  auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

  /// Ternary blend of the two shuffled results
  const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                   0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                   0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                   0x00000000FF0F00FF, 0x0F00FF0F00FF0F00);
  return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
}
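
/// Packed sizes produced by the three routines above, per RB of 24 int16 samples
/// (the valid bytes land in the lowest three 128-bit lanes and are joined by offset
/// stores in the compression kernels below):
///   9 bit samples  -> 27 data bytes (3 lanes x 9),  28 bytes per RB with the exponent
///   10 bit samples -> 30 data bytes (3 lanes x 10), 31 bytes per RB with the exponent
///   12 bit samples -> 36 data bytes (3 lanes x 12), 37 bytes per RB with the exponent
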
/// Unpack compressed 9 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkByteUnpack9b(const uint8_t* inData)
{
  /// Align chunks of compressed bytes into lanes to allow for expansion
  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4,
                                          5, 4, 3, 2, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

  /// Byte shuffle to get all bits for each sample into 16b chunks
  /// Due to previous permute to get chunks of bytes into each lane, there is
  /// a different shuffle offset in each lane
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0F0E0D0C0B0A0908, 0x0706050403020100,
                                                     0x090A080907080607, 0x0506040503040203,
                                                     0x0809070806070506, 0x0405030402030102,
                                                     0x0708060705060405, 0x0304020301020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  /// Logical shift left to set sign bit
  const __m512i k_slBits = _mm512_set_epi64(0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000,
                                            0x0007000600050004, 0x0003000200010000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  /// Mask to zero unwanted bits
  const __m512i k_expMask = _mm512_set1_epi16(0xFF80);
  return _mm512_and_epi64(expData, k_expMask);
}

/// Unpack compressed 10 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkByteUnpack10b(const uint8_t* inData)
{
  /// Align chunks of compressed bytes into lanes to allow for expansion
  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 8, 7, 6, 5,
                                          5, 4, 3, 2, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

  /// Byte shuffle to get all bits for each sample into 16b chunks
  /// Due to previous permute to get chunks of bytes into each lane, lanes
  /// 0 and 2 happen to be aligned, but lane 1 is offset by 2 bytes
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0809070806070506, 0x0304020301020001,
                                                     0x0809070806070506, 0x0304020301020001,
                                                     0x0A0B090A08090708, 0x0506040503040203,
                                                     0x0809070806070506, 0x0304020301020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  /// Logical shift left to set sign bit
  const __m512i k_slBits = _mm512_set_epi64(0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000,
                                            0x0006000400020000, 0x0006000400020000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  /// Mask to zero unwanted bits
  const __m512i k_expMask = _mm512_set1_epi16(0xFFC0);
  return _mm512_and_epi64(expData, k_expMask);
}

/// Unpack compressed 12 bit data in network byte order
/// See https://soco.intel.com/docs/DOC-2665619
static __m512i
networkByteUnpack12b(const uint8_t* inData)
{
  /// Align chunks of compressed bytes into lanes to allow for expansion
  const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
  const auto k_expPerm = _mm512_set_epi32(15, 14, 13, 12, 9, 8, 7, 6,
                                          6, 5, 4, 3, 3, 2, 1, 0);
  auto expData = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

  /// Byte shuffle to get all bits for each sample into 16b chunks
  /// For 12b mantissa all lanes post-permute are aligned and require same shuffle offset
  const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001,
                                                     0x0A0B090A07080607, 0x0405030401020001);
  expData = _mm512_shuffle_epi8(expData, k_byteShuffleMask);

  /// Logical shift left to set sign bit
  const __m512i k_slBits = _mm512_set_epi64(0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000,
                                            0x0004000000040000, 0x0004000000040000);
  expData = _mm512_sllv_epi16(expData, k_slBits);

  /// Mask to zero unwanted bits
  const __m512i k_expMask = _mm512_set1_epi16(0xFFF0);
  return _mm512_and_epi64(expData, k_expMask);
}
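
/// The three unpack routines above return each sample left-aligned in its 16-bit slot
/// with the sign bit in the MSB (the masks 0xFF80 / 0xFFC0 / 0xFFF0 keep the top
/// 9 / 10 / 12 bits). The expansion kernels below complete the scaling with an
/// arithmetic right shift of (k_maxExpShift - exponent), where k_maxExpShift is
/// 16 - iqWidth (7, 6 and 4 respectively).
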
/// 8 bit compression
void
BlockFloatCompander::BlockFloatCompress_8b_AVX512(const ExpandedData& dataIn, CompressedData* dataOut)
{
  /// Compute exponent and store for later use
  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
  computeExponent(dataIn, storedExp);

  /// Shift 1RB by corresponding exponent and write exponent and data to output
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
    auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);
    auto thisRBExpAddr = n * (BlockFloatCompander::k_numREReal + 1);
    /// Store exponent first
    dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];
    /// Store compressed RB
    constexpr uint32_t k_rbMask = 0x00FFFFFF; // Write mask for 1RB (24 values)
    _mm256_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_rbMask, _mm512_cvtepi16_epi8(compData));
  }
}

/// 9 bit compression
void
BlockFloatCompander::BlockFloatCompress_9b_AVX512(const ExpandedData& dataIn, CompressedData* dataOut)
{
  /// Compute exponent and store for later use
  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
  computeExponent(dataIn, storedExp);

  /// Shift 1RB by corresponding exponent and write exponent and data to output
  /// Output data is packed exponent first followed by corresponding compressed RB
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    /// Apply exponent shift
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
    auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);

    /// Pack compressed data in network byte order
    auto compDataBytePacked = networkBytePack9b(compData);

    /// Store exponent first
    constexpr int k_totNumBytesPerRB = 28;
    auto thisRBExpAddr = n * k_totNumBytesPerRB;
    dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];

    /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
    /// Use three offset stores to join
    constexpr uint16_t k_RbWriteMask = 0x01FF;
    constexpr int k_numDataBytesPerLane = 9;
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
  }
}

/// 10 bit compression
void
BlockFloatCompander::BlockFloatCompress_10b_AVX512(const ExpandedData& dataIn, CompressedData* dataOut)
{
  /// Compute exponent and store for later use
  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
  computeExponent(dataIn, storedExp);

  /// Shift 1RB by corresponding exponent and write exponent and data to output
  /// Output data is packed exponent first followed by corresponding compressed RB
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    /// Apply exponent shift
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
    auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);

    /// Pack compressed data in network byte order
    auto compDataBytePacked = networkBytePack10b(compData);

    /// Store exponent first
    constexpr int k_totNumBytesPerRB = 31;
    auto thisRBExpAddr = n * k_totNumBytesPerRB;
    dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];

    /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
    /// Use three offset stores to join
    constexpr uint16_t k_RbWriteMask = 0x03FF;
    constexpr int k_numDataBytesPerLane = 10;
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
  }
}

/// 12 bit compression
void
BlockFloatCompander::BlockFloatCompress_12b_AVX512(const ExpandedData& dataIn, CompressedData* dataOut)
{
  /// Compute exponent and store for later use
  int8_t storedExp[BlockFloatCompander::k_numRB] = {};
  computeExponent(dataIn, storedExp);

  /// Shift 1RB by corresponding exponent and write exponent and data to output
  /// Output data is packed exponent first followed by corresponding compressed RB
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    /// Apply exponent shift
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(dataIn.dataExpanded + n * BlockFloatCompander::k_numREReal);
    auto compData = _mm512_srai_epi16(*rawDataIn, storedExp[n]);

    /// Pack compressed data in network byte order
    auto compDataBytePacked = networkBytePack12b(compData);

    /// Store exponent first
    constexpr int k_totNumBytesPerRB = 37;
    auto thisRBExpAddr = n * k_totNumBytesPerRB;
    dataOut->dataCompressed[thisRBExpAddr] = storedExp[n];

    /// Now have 1 RB worth of bytes separated into 3 chunks (1 per lane)
    /// Use three offset stores to join
    constexpr uint16_t k_RbWriteMask = 0x0FFF;
    constexpr int k_numDataBytesPerLane = 12;
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 0));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + k_numDataBytesPerLane, k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 1));
    _mm_mask_storeu_epi8(dataOut->dataCompressed + thisRBExpAddr + 1 + (2 * k_numDataBytesPerLane), k_RbWriteMask, _mm512_extracti64x2_epi64(compDataBytePacked, 2));
  }
}

/// 8 bit expansion
void
BlockFloatCompander::BlockFloatExpand_8b_AVX512(const CompressedData& dataIn, ExpandedData* dataOut)
{
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    /// Expand 1RB of data
    auto expAddr = n * (BlockFloatCompander::k_numREReal + 1);
    const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(dataIn.dataCompressed + expAddr + 1);
    const auto compData16 = _mm512_cvtepi8_epi16(*rawDataIn);
    const auto expData = _mm512_slli_epi16(compData16, *(dataIn.dataCompressed + expAddr));
    /// Write expanded data to output
    constexpr uint8_t k_rbMask64 = 0b00111111; // 64b write mask for 1RB (24 int16 values)
    _mm512_mask_storeu_epi64(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_rbMask64, expData);
  }
}

/// 9 bit expansion
void
BlockFloatCompander::BlockFloatExpand_9b_AVX512(const CompressedData& dataIn, ExpandedData* dataOut)
{
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    constexpr int k_totNumBytesPerRB = 28;
    auto expAddr = n * k_totNumBytesPerRB;

    /// Unpack network order packed data
    auto expData = networkByteUnpack9b(dataIn.dataCompressed + expAddr + 1);

    /// Apply exponent scaling (by appropriate arithmetic shift right)
    constexpr int k_maxExpShift = 7;
    expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));

    /// Write expanded data to output
    static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
    _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
  }
}

/// 10 bit expansion
void
BlockFloatCompander::BlockFloatExpand_10b_AVX512(const CompressedData& dataIn, ExpandedData* dataOut)
{
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    constexpr int k_totNumBytesPerRB = 31;
    auto expAddr = n * k_totNumBytesPerRB;

    /// Unpack network order packed data
    auto expData = networkByteUnpack10b(dataIn.dataCompressed + expAddr + 1);

    /// Apply exponent scaling (by appropriate arithmetic shift right)
    constexpr int k_maxExpShift = 6;
    expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));

    /// Write expanded data to output
    static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
    _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
  }
}

/// 12 bit expansion
void
BlockFloatCompander::BlockFloatExpand_12b_AVX512(const CompressedData& dataIn, ExpandedData* dataOut)
{
#pragma unroll(BlockFloatCompander::k_numRB)
  for (int n = 0; n < BlockFloatCompander::k_numRB; ++n)
  {
    constexpr int k_totNumBytesPerRB = 37;
    auto expAddr = n * k_totNumBytesPerRB;

    /// Unpack network order packed data
    auto expData = networkByteUnpack12b(dataIn.dataCompressed + expAddr + 1);

    /// Apply exponent scaling (by appropriate arithmetic shift right)
    constexpr int k_maxExpShift = 4;
    expData = _mm512_srai_epi16(expData, k_maxExpShift - *(dataIn.dataCompressed + expAddr));

    /// Write expanded data to output
    static constexpr uint32_t k_WriteMask = 0x00FFFFFF;
    _mm512_mask_storeu_epi16(dataOut->dataExpanded + n * BlockFloatCompander::k_numREReal, k_WriteMask, expData);
  }
}

/// Reference compression
void
BlockFloatCompander::BlockFloatCompress_Basic(const ExpandedData& dataIn, CompressedData* dataOut)
{
  int dataOutIdx = 0;
  int16_t iqMask = (int16_t)((1 << dataIn.iqWidth) - 1);
  int byteShiftUnits = dataIn.iqWidth - 8;

  for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
  {
    /// Find max abs value for this RB
    int16_t maxAbs = 0;
    for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
    {
      auto dataIdx = rb * BlockFloatCompander::k_numREReal + re;
      auto dataAbs = saturateAbs(dataIn.dataExpanded[dataIdx]);
      maxAbs = std::max(maxAbs, dataAbs);
    }

    // Find exponent and insert into byte stream
    auto thisExp = (uint8_t)(std::max(0, (16 - dataIn.iqWidth + 1 - __lzcnt16(maxAbs))));
    dataOut->dataCompressed[dataOutIdx++] = thisExp;

    /// Arithmetic right shift (ARS) data by exponent and pack bytes in network order
    /// This uses a sliding buffer where one or more bytes are
    /// extracted after the insertion of each compressed sample
    static constexpr int k_byteMask = 0xFF;
    int byteShiftVal = -8;
    int byteBuffer = { 0 };
    for (int re = 0; re < BlockFloatCompander::k_numREReal; ++re)
    {
      auto dataIdxIn = rb * BlockFloatCompander::k_numREReal + re;
      auto thisRE = dataIn.dataExpanded[dataIdxIn] >> thisExp;
      byteBuffer = (byteBuffer << dataIn.iqWidth) + (int)(thisRE & iqMask);

      byteShiftVal += (8 + byteShiftUnits);
      while (byteShiftVal >= 0)
      {
        auto thisByte = (uint8_t)((byteBuffer >> byteShiftVal) & k_byteMask);
        dataOut->dataCompressed[dataOutIdx++] = thisByte;
        byteShiftVal -= 8;
      }
    }
  }
  dataOut->iqWidth = dataIn.iqWidth;
}
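
/// Worked example of the sliding-buffer packing above (for reference), iqWidth = 9:
/// byteShiftVal starts at -8 and gains 9 per inserted sample, so samples 0..6 each
/// emit one byte (the top 8 of the buffered bits), leaving 1..7 residual bits, and
/// every 8th sample emits two bytes as the buffer becomes byte aligned again,
/// giving 9 bytes per 8 samples and 27 data bytes per 24-sample RB.
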
/// Reference expansion
void
BlockFloatCompander::BlockFloatExpand_Basic(const CompressedData& dataIn, ExpandedData* dataOut)
{
  uint32_t iqMask = (uint32_t)(UINT_MAX - ((1 << (32 - dataIn.iqWidth)) - 1));
  uint32_t byteBuffer = { 0 };
  int numBytesPerRB = (3 * dataIn.iqWidth) + 1;
  int bitPointer = 0;
  int dataIdxOut = 0;

  for (int rb = 0; rb < BlockFloatCompander::k_numRB; ++rb)
  {
    auto expIdx = rb * numBytesPerRB;
    auto signExtShift = 32 - dataIn.iqWidth - dataIn.dataCompressed[expIdx];

    for (int b = 0; b < numBytesPerRB - 1; ++b)
    {
      auto dataIdxIn = (expIdx + 1) + b;
      auto thisByte = (uint16_t)dataIn.dataCompressed[dataIdxIn];
      byteBuffer = (uint32_t)((byteBuffer << 8) + thisByte);
      bitPointer += 8;
      while (bitPointer >= dataIn.iqWidth)
      {
        /// byteBuffer currently has enough data in it to extract a sample
        /// Shift left first to set sign bit at MSB, then shift right to
        /// sign extend down to iqWidth. Finally recast to int16.
        int32_t thisSample32 = (int32_t)((byteBuffer << (32 - bitPointer)) & iqMask);
        int16_t thisSample = (int16_t)(thisSample32 >> signExtShift);
        bitPointer -= dataIn.iqWidth;
        dataOut->dataExpanded[dataIdxOut++] = thisSample;
      }
    }
  }
}
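
/// Note on the arithmetic above: after the left shift the iqWidth sample bits sit at the
/// top of the 32-bit word, so the arithmetic right shift by (32 - iqWidth - exponent)
/// both sign extends the sample and re-applies the block exponent, yielding
/// sample * 2^exponent. This matches the shift-based scaling used by the AVX512
/// expansion kernels above.
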
/// Reference compression
void
BlockFloatCompanderBFW::BlockFloatCompress_Basic(const BlockFloatCompanderBFW::ExpandedData& dataIn, BlockFloatCompanderBFW::CompressedData* dataOut)
{
  int dataOutIdx = 0;
  int16_t iqMask = (int16_t)((1 << dataIn.iqWidth) - 1);
  int byteShiftUnits = dataIn.iqWidth - 8;

  for (int rb = 0; rb < BlockFloatCompanderBFW::k_numRB; ++rb)
  {
    /// Find max abs value for this RB
    int16_t maxAbs = 0;
    for (int re = 0; re < BlockFloatCompanderBFW::k_numREReal; ++re)
    {
      auto dataIdx = rb * BlockFloatCompanderBFW::k_numREReal + re;
      auto dataAbs = saturateAbs(dataIn.dataExpanded[dataIdx]);
      maxAbs = std::max(maxAbs, dataAbs);
    }

    // Find exponent and insert into byte stream
    auto thisExp = (uint8_t)(std::max(0, (16 - dataIn.iqWidth + 1 - __lzcnt16(maxAbs))));
    dataOut->dataCompressed[dataOutIdx++] = thisExp;

    /// Arithmetic right shift (ARS) data by exponent and pack bytes in network order
    /// This uses a sliding buffer where one or more bytes are
    /// extracted after the insertion of each compressed sample
    static constexpr int k_byteMask = 0xFF;
    int byteShiftVal = -8;
    int byteBuffer = { 0 };
    for (int re = 0; re < BlockFloatCompanderBFW::k_numREReal; ++re)
    {
      auto dataIdxIn = rb * BlockFloatCompanderBFW::k_numREReal + re;
      auto thisRE = dataIn.dataExpanded[dataIdxIn] >> thisExp;
      byteBuffer = (byteBuffer << dataIn.iqWidth) + (int)(thisRE & iqMask);

      byteShiftVal += (8 + byteShiftUnits);
      while (byteShiftVal >= 0)
      {
        auto thisByte = (uint8_t)((byteBuffer >> byteShiftVal) & k_byteMask);
        dataOut->dataCompressed[dataOutIdx++] = thisByte;
        byteShiftVal -= 8;
      }
    }
  }
  dataOut->iqWidth = dataIn.iqWidth;
}

/// Reference expansion
void
BlockFloatCompanderBFW::BlockFloatExpand_Basic(const BlockFloatCompanderBFW::CompressedData& dataIn, BlockFloatCompanderBFW::ExpandedData* dataOut)
{
  uint32_t iqMask = (uint32_t)(UINT_MAX - ((1 << (32 - dataIn.iqWidth)) - 1));
  uint32_t byteBuffer = { 0 };
  int numBytesPerRB = (3 * dataIn.iqWidth) + 1;
  int bitPointer = 0;
  int dataIdxOut = 0;

  for (int rb = 0; rb < BlockFloatCompanderBFW::k_numRB; ++rb)
  {
    auto expIdx = rb * numBytesPerRB;
    auto signExtShift = 32 - dataIn.iqWidth - dataIn.dataCompressed[expIdx];

    for (int b = 0; b < numBytesPerRB - 1; ++b)
    {
      auto dataIdxIn = (expIdx + 1) + b;
      auto thisByte = (uint16_t)dataIn.dataCompressed[dataIdxIn];
      byteBuffer = (uint32_t)((byteBuffer << 8) + thisByte);
      bitPointer += 8;
      while (bitPointer >= dataIn.iqWidth)
      {
        /// byteBuffer currently has enough data in it to extract a sample
        /// Shift left first to set sign bit at MSB, then shift right to
        /// sign extend down to iqWidth. Finally recast to int16.
        int32_t thisSample32 = (int32_t)((byteBuffer << (32 - bitPointer)) & iqMask);
        int16_t thisSample = (int16_t)(thisSample32 >> signExtShift);
        bitPointer -= dataIn.iqWidth;
        dataOut->dataExpanded[dataIdxOut++] = thisSample;
      }
    }
  }
}

#define RB_NUM_ROUNDUP(rb) \
  (BlockFloatCompander::k_numRB * ((rb + BlockFloatCompander::k_numRB - 1) / BlockFloatCompander::k_numRB))
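
/// Example (for reference), with BlockFloatCompander::k_numRB = 16 (one kernel call
/// processes 16 RBs, per the loop comment below): RB_NUM_ROUNDUP(273) =
/// 16 * ((273 + 15) / 16) = 16 * 18 = 288, i.e. the requested RB count is rounded up
/// to a whole number of 16-RB blocks.
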
/** callback function type for Symbol packet */
typedef void (*xran_bfp_compress_fn)(const BlockFloatCompander::ExpandedData& dataIn,
                                     BlockFloatCompander::CompressedData* dataOut);

int32_t
xranlib_compress_avx512(const struct xranlib_compress_request *request,
                        struct xranlib_compress_response *response)
{
  BlockFloatCompander::ExpandedData expandedDataInput;
  BlockFloatCompander::CompressedData compressedDataOut;
  xran_bfp_compress_fn com_fn = NULL;
  int16_t numRBs = request->numRBs;
  int16_t len = 0;

  switch (request->iqWidth) {
    case 8:
      expandedDataInput.iqWidth = 8;
      com_fn = BlockFloatCompander::BlockFloatCompress_8b_AVX512;
      break;
    case 9:
      expandedDataInput.iqWidth = 9;
      com_fn = BlockFloatCompander::BlockFloatCompress_9b_AVX512;
      break;
    case 10:
      expandedDataInput.iqWidth = 10;
      com_fn = BlockFloatCompander::BlockFloatCompress_10b_AVX512;
      break;
    case 12:
      expandedDataInput.iqWidth = 12;
      com_fn = BlockFloatCompander::BlockFloatCompress_12b_AVX512;
      break;
    default:
      expandedDataInput.iqWidth = request->iqWidth;
      com_fn = BlockFloatCompander::BlockFloatCompress_Basic;
      break;
  }

  for (int16_t block_idx = 0;
       block_idx < RB_NUM_ROUNDUP(numRBs) / BlockFloatCompander::k_numRB /*+ 1*/; /* 16 RBs at a time */
       block_idx++) {
    expandedDataInput.dataExpanded =
        &request->data_in[block_idx * BlockFloatCompander::k_numSampsExpanded];
    compressedDataOut.dataCompressed =
        (uint8_t*)&response->data_out[len];

    com_fn(expandedDataInput, &compressedDataOut);
    len += ((3 * expandedDataInput.iqWidth) + 1) * std::min((int16_t)BlockFloatCompander::k_numRB, (int16_t)numRBs);
  }

  response->len = ((3 * expandedDataInput.iqWidth) + 1) * numRBs;

  return 0;
}
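
/* Example use of xranlib_compress_avx512 (a sketch; only the request/response fields
 * referenced above are shown, see xran_compression.h for the full definitions;
 * iqSamples and compressedBuffer are placeholder buffers, and data_in is assumed to
 * cover a whole number of 16-RB blocks):
 *
 *   struct xranlib_compress_request  req = {};
 *   struct xranlib_compress_response rsp = {};
 *   req.numRBs   = 64;
 *   req.iqWidth  = 9;
 *   req.data_in  = iqSamples;           // 64 RBs x 24 int16 IQ values
 *   rsp.data_out = compressedBuffer;    // >= (3 * 9 + 1) * 64 = 1792 bytes
 *   xranlib_compress_avx512(&req, &rsp);
 *   // rsp.len == 1792
 */
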
/** callback function type for Symbol packet */
typedef void (*xran_bfp_compress_bfw_fn)(const BlockFloatCompanderBFW::ExpandedData& dataIn, BlockFloatCompanderBFW::CompressedData* dataOut);

int32_t
xranlib_compress_avx512_bfw(const struct xranlib_compress_request *request,
                            struct xranlib_compress_response *response)
{
  BlockFloatCompanderBFW::ExpandedData expandedDataInput;
  BlockFloatCompanderBFW::CompressedData compressedDataKern;
  xran_bfp_compress_bfw_fn com_fn = NULL;

#if 0 /* unused copy-in: dataExpanded is pointed directly at the request buffer below */
  for (int m = 0; m < BlockFloatCompander::k_numRB; ++m){
    for (int n = 0; n < BlockFloatCompander::k_numREReal; ++n){
      expandedDataInput.dataExpanded[m*BlockFloatCompander::k_numREReal+n] =
          request->data_in[m*BlockFloatCompander::k_numREReal+n];
    }
  }
#endif

  expandedDataInput.dataExpanded = request->data_in;
  compressedDataKern.dataCompressed = (uint8_t*)response->data_out;

  com_fn = BlockFloatCompanderBFW::BlockFloatCompress_Basic;
  switch (request->iqWidth) {
    case 8:
      expandedDataInput.iqWidth = 8;
      break;
    case 9:
      expandedDataInput.iqWidth = 9;
      //com_fn = BlockFloatCompanderBFW::BlockFloatExpand_9b_AVX512
      break;
    case 10:
      expandedDataInput.iqWidth = 10;
      break;
    case 12:
      expandedDataInput.iqWidth = 12;
      break;
    default:
      printf("bfwIqWidth is not supported %d\n", request->iqWidth);
      return -1;
  }

  com_fn(expandedDataInput, &compressedDataKern);
  response->len = ((BlockFloatCompanderBFW::k_numRE/16*4*expandedDataInput.iqWidth)+1)*BlockFloatCompanderBFW::k_numRB;

  return 0;
}

/** callback function type for Symbol packet */
typedef void (*xran_bfp_decompress_fn)(const BlockFloatCompander::CompressedData& dataIn, BlockFloatCompander::ExpandedData* dataOut);

int32_t
xranlib_decompress_avx512(const struct xranlib_decompress_request *request,
                          struct xranlib_decompress_response *response)
{
  BlockFloatCompander::CompressedData compressedDataInput;
  BlockFloatCompander::ExpandedData expandedDataOut;
  xran_bfp_decompress_fn decom_fn = NULL;
  int16_t numRBs = request->numRBs;
  int16_t len = 0;

  switch (request->iqWidth) {
    case 8:
      compressedDataInput.iqWidth = 8;
      decom_fn = BlockFloatCompander::BlockFloatExpand_8b_AVX512;
      break;
    case 9:
      compressedDataInput.iqWidth = 9;
      decom_fn = BlockFloatCompander::BlockFloatExpand_9b_AVX512;
      break;
    case 10:
      compressedDataInput.iqWidth = 10;
      decom_fn = BlockFloatCompander::BlockFloatExpand_10b_AVX512;
      break;
    case 12:
      compressedDataInput.iqWidth = 12;
      decom_fn = BlockFloatCompander::BlockFloatExpand_12b_AVX512;
      break;
    default:
      compressedDataInput.iqWidth = request->iqWidth;
      decom_fn = BlockFloatCompander::BlockFloatExpand_Basic;
      break;
  }

  for (int16_t block_idx = 0;
       block_idx < RB_NUM_ROUNDUP(numRBs) / BlockFloatCompander::k_numRB;
       block_idx++) {
    compressedDataInput.dataCompressed = (uint8_t*)&request->data_in[block_idx * (((3 * compressedDataInput.iqWidth) + 1) * BlockFloatCompander::k_numRB)];
    expandedDataOut.dataExpanded = &response->data_out[len];

    decom_fn(compressedDataInput, &expandedDataOut);
    len += std::min((int16_t)BlockFloatCompander::k_numSampsExpanded, (int16_t)(numRBs * BlockFloatCompander::k_numREReal));
  }

  response->len = numRBs * BlockFloatCompander::k_numREReal * sizeof(int16_t);

  return 0;
}