/******************************************************************************
*
*   Copyright (c) 2019 Intel.
*
*   Licensed under the Apache License, Version 2.0 (the "License");
*   you may not use this file except in compliance with the License.
*   You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*   limitations under the License.
*
*******************************************************************************/

/**
 * @brief xRAN BFP compression/decompression utility functions
 *
 * @file xran_bfp_utils.hpp
 * @ingroup group_source_xran
 * @author Intel Corporation
 **/

#pragma once
#include <immintrin.h>
#include <cstdint>

namespace BlockFloatCompander
{
  /// Define function signatures for byte packing and unpacking functions
  typedef __m512i(*PackFunction)(const __m512i);
  typedef __m512i(*UnpackFunction)(const uint8_t*);
  typedef __m256i(*UnpackFunction256)(const uint8_t*);
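  /// The bit-width specific networkBytePack*() and networkByteUnpack*() functions
  /// below provide implementations matching these signatures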

  /// Mask to zero the upper 16 bits of each 32b word, keeping only the lower 16b value
  inline __m512i
  maskUpperWord(const __m512i inData)
  {
    const auto k_upperWordMask = _mm512_set_epi64(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
                                                  0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF);
    return _mm512_and_epi64(inData, k_upperWordMask);
  }

  /// Calculate exponent based on 16 max abs values using leading zero count.
  inline __m512i
  expLzCnt(const __m512i maxAbs, const __m512i totShiftBits)
  {
    /// Compute exponent
    const auto lzCount = _mm512_lzcnt_epi32(maxAbs);
    return _mm512_subs_epu16(totShiftBits, lzCount);
  }

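  /// Return the horizontal maximum of the 32 16b absolute values held in maxAbsReg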
  inline int
  horizontalMax1x32(const __m512i maxAbsReg)
  {
    /// Swap each IQ pair in each lane (via 32b rotation) and compute max of
    /// each pair.
    const auto maxRot16 = _mm512_rol_epi32(maxAbsReg, BlockFloatCompander::k_numBitsIQ);
    const auto maxAbsIQ = _mm512_max_epi16(maxAbsReg, maxRot16);
    /// Convert to 32b by removing repeated values in maxAbs
    const auto maxAbs32 = maskUpperWord(maxAbsIQ);
    /// Return reduced max
    return _mm512_reduce_max_epi32(maxAbs32);
  }

  /// Pack compressed 9 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
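  /// Input: 32 16b values, each holding a compressed 9b sample. Output: each 128b
  /// lane packs its 8 samples into bytes 0-8; bytes 9-15 of each lane are zero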
  inline __m512i
  networkBytePack9b(const __m512i compData)
  {
    /// Logical shift left to align network order byte parts
    const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000100020003, 0x0004000500060007,
                                                 0x0000000100020003, 0x0004000500060007,
                                                 0x0000000100020003, 0x0004000500060007,
                                                 0x0000000100020003, 0x0004000500060007);
    const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

    /// First epi8 shuffle of even indexed samples
    const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x0000000000000000, 0x0C0D080904050001,
                                                        0x0000000000000000, 0x0C0D080904050001,
                                                        0x0000000000000000, 0x0C0D080904050001,
                                                        0x0000000000000000, 0x0C0D080904050001);
    constexpr uint64_t k_byteMask1 = 0x00FF00FF00FF00FF;
    const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

    /// Second epi8 shuffle of odd indexed samples
    const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000000000E, 0x0F0A0B0607020300,
                                                        0x000000000000000E, 0x0F0A0B0607020300,
                                                        0x000000000000000E, 0x0F0A0B0607020300,
                                                        0x000000000000000E, 0x0F0A0B0607020300);
    constexpr uint64_t k_byteMask2 = 0x01FE01FE01FE01FE;
    const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

    /// Ternary blend of the two shuffled results
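    /// The immediate 0xd8 implements a bitwise select: where k_ternLogSelect has a 1
    /// the bit is taken from compDataShuff2, otherwise from compDataShuff1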
    const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000000000FF, 0x01FC07F01FC07F00,
                                                     0x00000000000000FF, 0x01FC07F01FC07F00,
                                                     0x00000000000000FF, 0x01FC07F01FC07F00,
                                                     0x00000000000000FF, 0x01FC07F01FC07F00);
    return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
  }


  /// Pack compressed 10 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
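  /// Input: 32 16b values, each holding a compressed 10b sample. Output: each 128b
  /// lane packs its 8 samples into bytes 0-9; bytes 10-15 of each lane are zero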
  inline __m512i
  networkBytePack10b(const __m512i compData)
  {
    /// Logical shift left to align network order byte parts
    const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
                                                 0x0000000200040006, 0x0000000200040006,
                                                 0x0000000200040006, 0x0000000200040006,
                                                 0x0000000200040006, 0x0000000200040006);
    const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

    /// First epi8 shuffle of even indexed samples
    const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x000000000000000C, 0x0D08090004050001,
                                                        0x000000000000000C, 0x0D08090004050001,
                                                        0x000000000000000C, 0x0D08090004050001,
                                                        0x000000000000000C, 0x0D08090004050001);
    constexpr uint64_t k_byteMask1 = 0x01EF01EF01EF01EF;
    const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

    /// Second epi8 shuffle of odd indexed samples
    const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x0000000000000E0F, 0x0A0B000607020300,
                                                        0x0000000000000E0F, 0x0A0B000607020300,
                                                        0x0000000000000E0F, 0x0A0B000607020300,
                                                        0x0000000000000E0F, 0x0A0B000607020300);
    constexpr uint64_t k_byteMask2 = 0x03DE03DE03DE03DE;
    const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

    /// Ternary blend of the two shuffled results
    const __m512i k_ternLogSelect = _mm512_set_epi64(0x000000000000FF03, 0xF03F00FF03F03F00,
                                                     0x000000000000FF03, 0xF03F00FF03F03F00,
                                                     0x000000000000FF03, 0xF03F00FF03F03F00,
                                                     0x000000000000FF03, 0xF03F00FF03F03F00);
    return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
  }


  /// Pack compressed 12 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
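  /// Input: 32 16b values, each holding a compressed 12b sample. Output: each 128b
  /// lane packs its 8 samples into bytes 0-11; bytes 12-15 of each lane are zero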
  inline __m512i
  networkBytePack12b(const __m512i compData)
  {
    /// Logical shift left to align network order byte parts
    const __m512i k_shiftLeft = _mm512_set_epi64(0x0000000400000004, 0x0000000400000004,
                                                 0x0000000400000004, 0x0000000400000004,
                                                 0x0000000400000004, 0x0000000400000004,
                                                 0x0000000400000004, 0x0000000400000004);
    const auto compDataPacked = _mm512_sllv_epi16(compData, k_shiftLeft);

    /// First epi8 shuffle of even indexed samples
    const __m512i k_byteShuffleMask1 = _mm512_set_epi64(0x00000000000C0D00, 0x0809000405000001,
                                                        0x00000000000C0D00, 0x0809000405000001,
                                                        0x00000000000C0D00, 0x0809000405000001,
                                                        0x00000000000C0D00, 0x0809000405000001);
    constexpr uint64_t k_byteMask1 = 0x06DB06DB06DB06DB;
    const auto compDataShuff1 = _mm512_maskz_shuffle_epi8(k_byteMask1, compDataPacked, k_byteShuffleMask1);

    /// Second epi8 shuffle of odd indexed samples
    const __m512i k_byteShuffleMask2 = _mm512_set_epi64(0x000000000E0F000A, 0x0B00060700020300,
                                                        0x000000000E0F000A, 0x0B00060700020300,
                                                        0x000000000E0F000A, 0x0B00060700020300,
                                                        0x000000000E0F000A, 0x0B00060700020300);
    constexpr uint64_t k_byteMask2 = 0x0DB60DB60DB60DB6;
    const auto compDataShuff2 = _mm512_maskz_shuffle_epi8(k_byteMask2, compDataPacked, k_byteShuffleMask2);

    /// Ternary blend of the two shuffled results
    const __m512i k_ternLogSelect = _mm512_set_epi64(0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                     0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                     0x00000000FF0F00FF, 0x0F00FF0F00FF0F00,
                                                     0x00000000FF0F00FF, 0x0F00FF0F00FF0F00);
    return _mm512_ternarylogic_epi64(compDataShuff1, compDataShuff2, k_ternLogSelect, 0xd8);
  }


  /// Unpack compressed 9 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
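  /// Note: a full 64 byte register is read from inData, which is assumed to be safe
  /// to dereference as a const __m512i*. Each of the 32 16b outputs holds its 9b
  /// sample in the most significant bits, with the remaining low bits zeroed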
  inline __m512i
  networkByteUnpack9b(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
    const auto k_expPerm = _mm512_set_epi32(9, 8, 7, 6, 7, 6, 5, 4,
                                            5, 4, 3, 2, 3, 2, 1, 0);
    const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// Due to previous permute to get chunks of bytes into each lane, there is
    /// a different shuffle offset in each lane
    const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A08090708, 0x0607050604050304,
                                                       0x090A080907080607, 0x0506040503040203,
                                                       0x0809070806070506, 0x0405030402030102,
                                                       0x0708060705060405, 0x0304020301020001);
    const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m512i k_slBits = _mm512_set_epi64(0x0007000600050004, 0x0003000200010000,
                                              0x0007000600050004, 0x0003000200010000,
                                              0x0007000600050004, 0x0003000200010000,
                                              0x0007000600050004, 0x0003000200010000);
    const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m512i k_expMask = _mm512_set1_epi16(0xFF80);
    return _mm512_and_epi64(inSetSign, k_expMask);
  }


  /// Unpack compressed 10 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
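  /// Each of the 32 16b outputs holds its 10b sample in the most significant bits,
  /// with the remaining low bits zeroed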
  inline __m512i
  networkByteUnpack10b(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
    const auto k_expPerm = _mm512_set_epi32(10, 9, 8, 7, 8, 7, 6, 5,
                                             5, 4, 3, 2, 3, 2, 1, 0);
    const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// Due to previous permute to get chunks of bytes into each lane, lanes
    /// 0 and 2 happen to be aligned, but lanes 1 and 3 are offset by 2 bytes
    const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A08090708, 0x0506040503040203,
                                                       0x0809070806070506, 0x0304020301020001,
                                                       0x0A0B090A08090708, 0x0506040503040203,
                                                       0x0809070806070506, 0x0304020301020001);
    const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m512i k_slBits = _mm512_set_epi64(0x0006000400020000, 0x0006000400020000,
                                              0x0006000400020000, 0x0006000400020000,
                                              0x0006000400020000, 0x0006000400020000,
                                              0x0006000400020000, 0x0006000400020000);
    const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m512i k_expMask = _mm512_set1_epi16(0xFFC0);
    return _mm512_and_epi64(inSetSign, k_expMask);
  }


  /// Unpack compressed 12 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
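  /// Each of the 32 16b outputs holds its 12b sample in the most significant bits,
  /// with the remaining low bits zeroed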
  inline __m512i
  networkByteUnpack12b(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m512i* rawDataIn = reinterpret_cast<const __m512i*>(inData);
    const auto k_expPerm = _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6,
                                             6, 5, 4, 3, 3, 2, 1, 0);
    const auto inLaneAlign = _mm512_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// For 12b mantissa all lanes post-permute are aligned and require same shuffle offset
    const __m512i k_byteShuffleMask = _mm512_set_epi64(0x0A0B090A07080607, 0x0405030401020001,
                                                       0x0A0B090A07080607, 0x0405030401020001,
                                                       0x0A0B090A07080607, 0x0405030401020001,
                                                       0x0A0B090A07080607, 0x0405030401020001);
    const auto inDatContig = _mm512_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m512i k_slBits = _mm512_set_epi64(0x0004000000040000, 0x0004000000040000,
                                              0x0004000000040000, 0x0004000000040000,
                                              0x0004000000040000, 0x0004000000040000,
                                              0x0004000000040000, 0x0004000000040000);
    const auto inSetSign = _mm512_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m512i k_expMask = _mm512_set1_epi16(0xFFF0);
    return _mm512_and_epi64(inSetSign, k_expMask);
  }


  /// Unpack compressed 9 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
  /// This unpacking function is for 256b registers
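  /// Each of the 16 16b outputs holds its 9b sample in the most significant bits,
  /// with the remaining low bits zeroed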
  inline __m256i
  networkByteUnpack9b256(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
    const auto k_expPerm = _mm256_set_epi32(5, 4, 3, 2, 3, 2, 1, 0);
    const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// Due to previous permute to get chunks of bytes into each lane, there is
    /// a different shuffle offset in each lane
    const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0809070806070506, 0x0405030402030102,
                                                        0x0708060705060405, 0x0304020301020001);
    const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m256i k_slBits = _mm256_set_epi64x(0x0007000600050004, 0x0003000200010000,
                                               0x0007000600050004, 0x0003000200010000);
    const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m256i k_expMask = _mm256_set1_epi16(0xFF80);
    return _mm256_and_si256(inSetSign, k_expMask);
  }


  /// Unpack compressed 10 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
  /// This unpacking function is for 256b registers
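  /// Each of the 16 16b outputs holds its 10b sample in the most significant bits,
  /// with the remaining low bits zeroed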
  inline __m256i
  networkByteUnpack10b256(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
    const auto k_expPerm = _mm256_set_epi32(5, 4, 3, 2, 3, 2, 1, 0);
    const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// Due to previous permute to get chunks of bytes into each lane, lane 0
    /// is aligned, but lane 1 is offset by 2 bytes
    const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0A0B090A08090708, 0x0506040503040203,
                                                        0x0809070806070506, 0x0304020301020001);
    const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m256i k_slBits = _mm256_set_epi64x(0x0006000400020000, 0x0006000400020000,
                                               0x0006000400020000, 0x0006000400020000);
    const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m256i k_expMask = _mm256_set1_epi16(0xFFC0);
    return _mm256_and_si256(inSetSign, k_expMask);
  }


  /// Unpack compressed 12 bit data in network byte order
  /// See https://soco.intel.com/docs/DOC-2665619
  /// This unpacking function is for 256b registers
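  /// Each of the 16 16b outputs holds its 12b sample in the most significant bits,
  /// with the remaining low bits zeroed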
  inline __m256i
  networkByteUnpack12b256(const uint8_t* inData)
  {
    /// Align chunks of compressed bytes into lanes to allow for expansion
    const __m256i* rawDataIn = reinterpret_cast<const __m256i*>(inData);
    const auto k_expPerm = _mm256_set_epi32(6, 5, 4, 3, 3, 2, 1, 0);
    const auto inLaneAlign = _mm256_permutexvar_epi32(k_expPerm, *rawDataIn);

    /// Byte shuffle to get all bits for each sample into 16b chunks
    /// For 12b mantissa all lanes post-permute are aligned and require same shuffle offset
    const __m256i k_byteShuffleMask = _mm256_set_epi64x(0x0A0B090A07080607, 0x0405030401020001,
                                                        0x0A0B090A07080607, 0x0405030401020001);
    const auto inDatContig = _mm256_shuffle_epi8(inLaneAlign, k_byteShuffleMask);

    /// Logical shift left to set sign bit
    const __m256i k_slBits = _mm256_set_epi64x(0x0004000000040000, 0x0004000000040000,
                                               0x0004000000040000, 0x0004000000040000);
    const auto inSetSign = _mm256_sllv_epi16(inDatContig, k_slBits);

    /// Mask to zero unwanted bits
    const __m256i k_expMask = _mm256_set1_epi16(0xFFF0);
    return _mm256_and_si256(inSetSign, k_expMask);
  }
}