1 /******************************************************************************
3 * Copyright (c) 2020 Intel.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 *******************************************************************************/
19 #include <immintrin.h>
20 #include "xran_mod_compression.h"
// Scalar QPSK modulation compression: keeps only the sign of each I and Q sample.
//   pData - interleaved int16 samples; pData[2*iSc] feeds bit_i, pData[2*iSc+1] feeds bit_q
//   pOut  - packed output; bits are OR-ed in, so bits already set in the byte are kept
//   unit  - not referenced by the statements shown
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, whatever advances pOut) are not visible in this listing.
24 mod_compression_qpsk_c(int16_t *pData,int8_t *pOut,int16_t unit, int32_t nSc)
26 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Slot (0..3) of this I/Q pair inside an output byte; each pair contributes two bits.
28 uint8_t bit_pos= iSc &0x3;
// One bit per sample: 0 when the sample is >= 0, 1 when it is negative.
29 int8_t bit_i = pData[iSc*2] >=0 ? 0 :1;
30 int8_t bit_q = pData[iSc*2+1] >=0 ? 0 :1;
// I goes to bit (7 - 2*slot) and Q to bit (6 - 2*slot): slot 0 occupies the two top bits.
31 *pOut |= bit_i<<(7-(bit_pos*2))|bit_q<<(6-(bit_pos*2));
// Scalar 16QAM modulation compression: each I and Q sample is divided by (unit >> 1)
// and the results are OR-ed into the output as 2-bit fields, two I/Q pairs per byte.
//   pData - interleaved int16 I/Q samples, 2*nSc values are read
//   pOut  - packed output; bits are OR-ed in, so bits already set in the byte are kept
//   unit  - scaling value; only unit >> 1 is used
//   nSc   - number of I/Q pairs to process
38 mod_compression_16qam_c(int16_t *pData,int8_t *pOut,int16_t unit, int32_t nSc)
// Divisor for the samples. No check against 0 is visible here, so unit < 2 would
// make the divisions below divide by zero.
40 int16_t bit_unit = unit>>1;
41 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Slot (0 or 1) of this I/Q pair inside an output byte.
43 uint8_t bit_pos= iSc &0x1;
// Integer quotients of the samples (C division truncates toward zero).
44 int8_t bit_i = pData[iSc*2]/bit_unit;
45 int8_t bit_q = pData[iSc*2+1]/bit_unit;
// NOTE(review): the original lines between the quotients and the packing statement
// are not visible in this listing; any adjustment made there is not documented here.
// Slot 0: I at bits 7:6, Q at bits 5:4. Slot 1: I at bits 3:2, Q at bits 1:0.
55 *pOut |= bit_i<<(6-(bit_pos*4))|bit_q<<(4-(bit_pos*4));
// Scalar 64QAM modulation compression: each I and Q sample is divided by (unit >> 2)
// and the results are OR-ed into the output as 3-bit fields. Four I/Q pairs make
// 24 bits, so the packing pattern depends on the pair's position modulo 4.
//   pData - interleaved int16 I/Q samples, 2*nSc values are read
//   pOut  - packed output; bits are OR-ed in, so bits already set in the byte are kept
//   unit  - scaling value; only unit >> 2 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, the first `if`, pOut increments and any
// statements between the quotients and the packing) are not visible in this listing.
62 mod_compression_64qam_c(int16_t *pData,int8_t *pOut,int16_t unit, int32_t nSc)
// Divisor for the samples; no check against 0 is visible here.
64 int16_t bit_unit = unit>>2;
65 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Position (0..3) of this pair within its group of four.
67 int32_t bit_pos = iSc &0x3;
// Integer quotients of the samples (C division truncates toward zero).
68 int8_t bit_i = pData[iSc*2]/bit_unit;
69 int8_t bit_q = pData[iSc*2+1]/bit_unit;
// I at bits 7:5 and Q at bits 4:2 of the current byte.
// NOTE(review): presumably the position-0 case; its condition line is not visible.
80 *pOut |= bit_i<<5|bit_q<<2;
82 else if (1 == bit_pos)
// Only the lowest bit of I survives the int8_t store (bit 7); Q lands at bits 6:4.
86 *pOut |= bit_i<<7|bit_q<<4;
88 else if (2 == bit_pos)
// I at bits 3:1; the top bit of Q goes to bit 0.
90 *pOut |= bit_i<<1|bit_q>>2;
94 else if (3 == bit_pos)
// I at bits 5:3 and Q at bits 2:0.
96 *pOut |= bit_i<<3|bit_q;
// Scalar 256QAM modulation compression: each I and Q sample is divided by (unit >> 3)
// and the two results are combined into one output byte per I/Q pair.
//   pData - interleaved int16 I/Q samples, 2*nSc values are read
//   pOut  - packed output; unlike the other scalar kernels the byte is assigned, not OR-ed
//   unit  - scaling value; only unit >> 3 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, the pOut advance and any statements between the
// quotients and the packing) are not visible in this listing.
103 mod_compression_256qam_c(int16_t *pData,int8_t *pOut,int16_t unit,int32_t nSc)
// Divisor for the samples; no check against 0 is visible here.
105 int16_t bit_unit = unit>>3;
106 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Integer quotients of the samples (C division truncates toward zero).
108 int8_t bit_i = pData[iSc*2]/bit_unit;
109 int8_t bit_q = pData[iSc*2+1]/bit_unit;
// I is shifted into the high nibble and OR-ed with Q.
118 *pOut = (bit_i<<4)|bit_q;
// AVX-512 QPSK modulation compression: extracts the sign bit of every int16 sample.
// The main loop consumes 16 I/Q pairs (one 512-bit load) per iteration and writes
// 32 bits; the remaining nSc & 0xf pairs are fetched with a masked load.
//   pData - interleaved int16 I/Q samples
//   pOut  - packed output, one bit per sample
//   unit  - not referenced by the statements shown
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, some declarations, pointer increments and the
// last row of the permute index) are not visible in this listing.
123 void mod_compression_qpsk_avx512(int16_t *pData,int8_t *pOut, int16_t unit, int32_t nSc)
// Index vector for the 16-bit permute. The rows shown reverse the order of the
// elements inside each group of eight (element 31 <- 24, ..., element 24 <- 31, ...).
126 __m512i permute_index = _mm512_set_epi16(24,25,26,27,28,29,30,31,
127 16,17,18,19,20,21,22,23,
128 8,9,10,11,12,13,14,15,
131 // Split nSc into a multiple-of-16 part (nSc0) and a 0..15 remainder (nSc1).
132 const int32_t nSc0 = nSc&0xfffffff0;
133 const int32_t nSc1 = nSc&0xf;
136 __m512i *pDataOffset = (__m512i *) pData;
139 for(int32_t iSc=0; iSc<nSc0; iSc=iSc+16)
// Unaligned load of 32 int16 values (16 I/Q pairs).
141 symbol = _mm512_loadu_epi32 (pDataOffset);
143 symbol = _mm512_permutexvar_epi16 (permute_index, symbol);
// Collect the sign bit of each 16-bit element into a 32-bit mask.
144 bits = _mm512_movepi16_mask(symbol);
// Store all 32 sign bits (4 bytes) through an int32_t view of the output.
145 *(int32_t *)pOut = bits;
// Tail: mask with the nSc1 lowest bits set; each bit enables one 32-bit I/Q pair.
151 k1 = ((__mmask16)1<<nSc1)-1;
// Masked load: pairs beyond the remainder are read as zero.
153 symbol = _mm512_mask_loadu_epi32 (_mm512_setzero_si512(), k1, pDataOffset);
154 symbol = _mm512_permutexvar_epi16 (permute_index, symbol);
155 bits = _mm512_movepi16_mask(symbol);
// Copy ceil(nSc1 / 4) bytes of the mask: four I/Q pairs (8 sign bits) per byte.
156 for (uint8_t idx = 0;idx<(((nSc1-1)>>2)+1);idx++)
158 *pOut = *(((int8_t *)&bits)+idx);
// Packs 2-bit values held in 16-bit elements into bytes, separately in each 128-bit lane.
// For the eight elements e0..e7 of a lane the result holds
//   byte 0 = e0 at bits 7:6, e1 at bits 5:4, e2 at bits 3:2, e3 at bits 1:0
//   byte 1 = e4..e7 in the same layout
// and zero in bytes 2..15 of the lane.
//   comp_data - 32 int16 elements; only the low two bits of each end up in the result
//   returns   - vector with two packed bytes at the bottom of every 128-bit lane
165 byte_pack2b(const __m512i comp_data)
// Per-element left shifts, repeating every four elements: 6, 4, 2, 0 bits.
167 const __m512i k_shift_left = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
168 0x0000000200040006, 0x0000000200040006,
169 0x0000000200040006, 0x0000000200040006,
170 0x0000000200040006, 0x0000000200040006);
171 const auto comp_data_packed = _mm512_sllv_epi16(comp_data, k_shift_left);
// Shuffle 1: low bytes of e0 and e4 go to bytes 0 and 1 of the lane.
173 const __m512i k_byte_shufflemask1 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000800,
174 0x0000000000000000, 0x0000000000000800,
175 0x0000000000000000, 0x0000000000000800,
176 0x0000000000000000, 0x0000000000000800);
// Zero-mask: keep only bytes 0 and 1 of every 128-bit lane.
177 constexpr uint64_t k_bytemask1 = 0x0003000300030003;
178 const auto comp_data_shuff1 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask1);
// Shuffle 2: low bytes of e1 and e5.
180 const __m512i k_byte_shufflemask2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000A02,
181 0x0000000000000000, 0x0000000000000A02,
182 0x0000000000000000, 0x0000000000000A02,
183 0x0000000000000000, 0x0000000000000A02);
184 const auto comp_data_shuff2 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask2);
// Shuffle 3: low bytes of e2 and e6.
186 const __m512i k_byte_shufflemask3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000C04,
187 0x0000000000000000, 0x0000000000000C04,
188 0x0000000000000000, 0x0000000000000C04,
189 0x0000000000000000, 0x0000000000000C04);
190 const auto comp_data_shuff3 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask3);
// Shuffle 4: low bytes of e3 and e7.
192 const __m512i k_byte_shufflemask4 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000E06,
193 0x0000000000000000, 0x0000000000000E06,
194 0x0000000000000000, 0x0000000000000E06,
195 0x0000000000000000, 0x0000000000000E06);
196 const auto comp_data_shuff4 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask4);
198 /// Ternary blend of the four shuffled results (three successive bit-selects)
// Select masks: bits 5:4, bits 3:2 and bits 1:0 of the two packed bytes.
199 const __m512i k_ternlog_select1 = _mm512_set_epi64(0x0000000000000000, 0x0000000000003030,
200 0x0000000000000000, 0x0000000000003030,
201 0x0000000000000000, 0x0000000000003030,
202 0x0000000000000000, 0x0000000000003030);
204 const __m512i k_ternlog_select2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000C0C,
205 0x0000000000000000, 0x0000000000000C0C,
206 0x0000000000000000, 0x0000000000000C0C,
207 0x0000000000000000, 0x0000000000000C0C);
209 const __m512i k_ternlog_select3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000303,
210 0x0000000000000000, 0x0000000000000303,
211 0x0000000000000000, 0x0000000000000303,
212 0x0000000000000000, 0x0000000000000303);
// Truth table 0xd8 is a bit-select: where the third operand has a 1 the bit comes
// from the second operand, otherwise from the first.
214 auto comp_data_packed2 = _mm512_ternarylogic_epi64(comp_data_shuff1, comp_data_shuff2, k_ternlog_select1, 0xd8);
215 auto comp_data_packed3 = _mm512_ternarylogic_epi64(comp_data_packed2, comp_data_shuff3, k_ternlog_select2, 0xd8);
216 return _mm512_ternarylogic_epi64(comp_data_packed3, comp_data_shuff4, k_ternlog_select3, 0xd8);
// Packs 2-bit values held in 16-bit elements into bytes and then gathers the packed
// bytes of all four 128-bit lanes with a single cross-lane byte permute.
// Per lane (elements e0..e7) the intermediate result is
//   byte 0 = e0 at bits 7:6, e1 at bits 5:4, e2 at bits 3:2, e3 at bits 1:0
//   byte 1 = e4..e7 in the same layout
// with bytes 2..15 of the lane zero.
//   comp_data - 32 int16 elements; only the low two bits of each end up in the result
//   returns   - the permuted vector of packed bytes
220 byte_pack2b_snc(const __m512i comp_data)
// Per-element left shifts, repeating every four elements: 6, 4, 2, 0 bits.
222 const __m512i k_shift_left = _mm512_set_epi64(0x0000000200040006, 0x0000000200040006,
223 0x0000000200040006, 0x0000000200040006,
224 0x0000000200040006, 0x0000000200040006,
225 0x0000000200040006, 0x0000000200040006);
226 const auto comp_data_packed = _mm512_sllv_epi16(comp_data, k_shift_left);
// Shuffle 1: low bytes of e0 and e4 go to bytes 0 and 1 of the lane.
228 const __m512i k_byte_shufflemask1 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000800,
229 0x0000000000000000, 0x0000000000000800,
230 0x0000000000000000, 0x0000000000000800,
231 0x0000000000000000, 0x0000000000000800);
// Zero-mask: keep only bytes 0 and 1 of every 128-bit lane.
232 constexpr uint64_t k_bytemask1 = 0x0003000300030003;
233 const auto comp_data_shuff1 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask1);
// Shuffle 2: low bytes of e1 and e5.
235 const __m512i k_byte_shufflemask2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000A02,
236 0x0000000000000000, 0x0000000000000A02,
237 0x0000000000000000, 0x0000000000000A02,
238 0x0000000000000000, 0x0000000000000A02);
239 const auto comp_data_shuff2 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask2);
// Shuffle 3: low bytes of e2 and e6.
241 const __m512i k_byte_shufflemask3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000C04,
242 0x0000000000000000, 0x0000000000000C04,
243 0x0000000000000000, 0x0000000000000C04,
244 0x0000000000000000, 0x0000000000000C04);
245 const auto comp_data_shuff3 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask3);
// Shuffle 4: low bytes of e3 and e7.
247 const __m512i k_byte_shufflemask4 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000E06,
248 0x0000000000000000, 0x0000000000000E06,
249 0x0000000000000000, 0x0000000000000E06,
250 0x0000000000000000, 0x0000000000000E06);
251 const auto comp_data_shuff4 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask4);
253 /// Ternary blend of the four shuffled results (three successive bit-selects)
// Select masks: bits 5:4, bits 3:2 and bits 1:0 of the two packed bytes.
254 const __m512i k_ternlog_select1 = _mm512_set_epi64(0x0000000000000000, 0x0000000000003030,
255 0x0000000000000000, 0x0000000000003030,
256 0x0000000000000000, 0x0000000000003030,
257 0x0000000000000000, 0x0000000000003030);
259 const __m512i k_ternlog_select2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000C0C,
260 0x0000000000000000, 0x0000000000000C0C,
261 0x0000000000000000, 0x0000000000000C0C,
262 0x0000000000000000, 0x0000000000000C0C);
264 const __m512i k_ternlog_select3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000303,
265 0x0000000000000000, 0x0000000000000303,
266 0x0000000000000000, 0x0000000000000303,
267 0x0000000000000000, 0x0000000000000303);
// Truth table 0xd8 is a bit-select: where the third operand has a 1 the bit comes
// from the second operand, otherwise from the first.
269 auto comp_data_packed2 = _mm512_ternarylogic_epi64(comp_data_shuff1, comp_data_shuff2, k_ternlog_select1, 0xd8);
270 auto comp_data_packed3 = _mm512_ternarylogic_epi64(comp_data_packed2, comp_data_shuff3, k_ternlog_select2, 0xd8);
271 auto comp_data_packed4 = _mm512_ternarylogic_epi64(comp_data_packed3, comp_data_shuff4, k_ternlog_select3, 0xd8);
// Byte-permute indices: the first two dwords reference source bytes 0x00,0x01,0x10,0x11
// and 0x20,0x21,0x30,0x31, i.e. the two packed bytes of each of the four lanes.
// NOTE(review): the line naming the constructor intrinsic is not visible in this
// listing, so which end of the vector these two dwords occupy cannot be confirmed here.
272 const auto k_byte_permute =
274 0x11100100, 0x31302120, 0xFFFFFFFF, 0xFFFFFFFF,
275 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
276 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
277 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
279 return _mm512_permutexvar_epi8(k_byte_permute,comp_data_packed4);
// AVX-512 16QAM modulation compression.
// Each int16 sample is divided by (unit >> 1); 3 is added to every element whose
// original sample was negative; the results are packed two bits per sample by
// byte_pack2b(), which yields two bytes per 128-bit lane.
//   pData - interleaved int16 I/Q samples
//   pOut  - packed output, 8 bytes per 16 I/Q pairs
//   unit  - scaling value; only unit >> 1 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, some declarations, the nSc1 assignment, the
// pData advance and the conditions around the printf and the tail) are not visible
// in this listing.
282 void mod_compression_16qam_avx512(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
284 int16_t bit_unit = unit>>1;
// Diagnostic text; the condition guarding it is on a line not visible here.
287 printf("modulation compression unit is too low!\n ");
290 __m512i symbol,symbol_unit ,bit_convert,byte_pack;
// Byte mask for the 128-bit stores below: write bytes 0 and 1 only.
292 __mmask16 mask_store = 0x3;
// Number of I/Q pairs covered by whole 16-pair iterations.
294 nSc0 = nSc&0xfffffff0;
296 symbol_unit = _mm512_set1_epi16(bit_unit);
297 bit_convert = _mm512_set1_epi16(3);
// Main loop: 16 I/Q pairs in, 8 bytes out per iteration.
298 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16 ,pOut = pOut+8)
300 symbol = _mm512_loadu_epi16(pData);
// Sign bits of the 32 original samples.
301 mask32 = _mm512_movepi16_mask(symbol);
303 symbol = _mm512_div_epi16(symbol ,symbol_unit);
// Add 3 only in the elements flagged as negative by mask32.
304 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
306 byte_pack = byte_pack2b(symbol);
// Two packed bytes from each of the four 128-bit lanes, written back to back.
307 _mm_mask_storeu_epi8(pOut , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 0));
308 _mm_mask_storeu_epi8(pOut+2 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 1));
309 _mm_mask_storeu_epi8(pOut+4 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 2));
310 _mm_mask_storeu_epi8(pOut+6 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 3));
// Tail: k1 has the nSc1 lowest bits set, one bit per remaining 32-bit I/Q pair.
315 __mmask16 k1 , left_mask;
316 k1 = ((__mmask16)1<<nSc1)-1;
// Masked load; pairs beyond the remainder are read as zero.
317 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
318 mask32 = _mm512_movepi16_mask(symbol);
319 symbol = _mm512_div_epi16(symbol ,symbol_unit);
320 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
321 byte_pack = byte_pack2b(symbol);
// Each lane covers four pairs = two output bytes. Byte 0 is written when the lane's
// first pair is present in k1, byte 1 when its third pair is present.
322 left_mask = (k1&0x1)|(((k1>>2)&0x1)<<1);
323 _mm_mask_storeu_epi8(pOut , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 0));
324 left_mask = ((k1>>4)&0x1)|(((k1>>6)&0x1)<<1);
325 _mm_mask_storeu_epi8(pOut+2 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 1));
326 left_mask = ((k1>>8)&0x1)|(((k1>>10)&0x1)<<1);
327 _mm_mask_storeu_epi8(pOut+4 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 2));
328 left_mask = ((k1>>12)&0x1)|(((k1>>14)&0x1)<<1);
329 _mm_mask_storeu_epi8(pOut+6 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 3));
333 void mod_compression_16qam_snc(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
335 int16_t bit_unit = unit>>1;
338 printf("modulation compression unit is too low!\n ");
341 __m512i symbol,symbol_unit ,bit_convert,byte_pack;
343 __mmask16 mask_store = 0x3;
345 nSc0 = nSc&0xfffffff0;
347 symbol_unit = _mm512_set1_epi16(bit_unit);
348 bit_convert = _mm512_set1_epi16(3);
349 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16 ,pOut = pOut+8)
351 symbol = _mm512_loadu_epi16(pData);
352 mask32 = _mm512_movepi16_mask(symbol);
354 symbol = _mm512_div_epi16(symbol ,symbol_unit);
355 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
356 _mm512_mask_storeu_epi32(pOut , mask_store ,byte_pack2b_snc(symbol));
361 __mmask16 k1 , left_mask;
362 int8_t left_byte = 0;
363 k1 = ((__mmask16)1<<nSc1)-1;
364 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
365 mask32 = _mm512_movepi16_mask(symbol);
366 symbol = _mm512_div_epi16(symbol ,symbol_unit);
367 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
368 byte_pack = byte_pack2b_snc(symbol);
369 left_byte = (nSc+1)>>1;
370 left_mask = ((__mmask16)1<<left_byte)-1;
371 _mm_mask_storeu_epi8(pOut , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 0));
// Packs 3-bit values held in 16-bit elements into bytes, separately in each 128-bit lane.
// The eight elements e0..e7 of a lane become 24 bits in bytes 0..2 of that lane:
//   byte 0 = e0 at bits 7:5, e1 at bits 4:2, upper two bits of e2 at bits 1:0
//   byte 1 = lowest bit of e2 at bit 7, e3 at bits 6:4, e4 at bits 3:1, top bit of e5 at bit 0
//   byte 2 = lower two bits of e5 at bits 7:6, e6 at bits 5:3, e7 at bits 2:0
// Bytes 3..15 of each lane are zero in the result.
//   comp_data - 32 int16 elements; only the low three bits of each end up in the result
//   returns   - vector with three packed bytes at the bottom of every 128-bit lane
376 byte_pack3b(const __m512i comp_data)
// Per-element left shifts for e0..e7: 5, 2, 7, 4, 1, 6, 3, 0 bits.
378 const __m512i k_shift_left = _mm512_set_epi64(0x0000000300060001, 0x0004000700020005,
379 0x0000000300060001, 0x0004000700020005,
380 0x0000000300060001, 0x0004000700020005,
381 0x0000000300060001, 0x0004000700020005);
382 const auto comp_data_packed = _mm512_sllv_epi16(comp_data, k_shift_left);
// Right shifts for the two values that straddle a byte boundary: e2 >> 1 and e5 >> 2.
385 const __m512i k_shift_right = _mm512_set_epi64(0x0000000000020000, 0x0000000100000000,
386 0x0000000000020000, 0x0000000100000000,
387 0x0000000000020000, 0x0000000100000000,
388 0x0000000000020000, 0x0000000100000000);
389 const auto comp_data_packed2 = _mm512_srlv_epi16(comp_data, k_shift_right);
// Shuffle 1: low bytes of the left-shifted e0, e2, e5 go to bytes 0, 1, 2.
391 const __m512i k_byte_shufflemask1 = _mm512_set_epi64(0x0000000000000000, 0x00000000000A0400,
392 0x0000000000000000, 0x00000000000A0400,
393 0x0000000000000000, 0x00000000000A0400,
394 0x0000000000000000, 0x00000000000A0400);
// Zero-mask: keep bytes 0..2 of every 128-bit lane.
395 constexpr uint64_t k_bytemask1 = 0x0007000700070007;
396 const auto comp_data_shuff1 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask1);
// Shuffle 2: low bytes of the left-shifted e1, e3, e6 go to bytes 0, 1, 2.
398 const __m512i k_byte_shufflemask2 = _mm512_set_epi64(0x0000000000000000, 0x00000000000C0602,
399 0x0000000000000000, 0x00000000000C0602,
400 0x0000000000000000, 0x00000000000C0602,
401 0x0000000000000000, 0x00000000000C0602);
402 const auto comp_data_shuff2 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask2);
// Shuffle 3: low bytes of the left-shifted e4 and e7 go to bytes 1 and 2.
404 const __m512i k_byte_shufflemask3 = _mm512_set_epi64(0x0000000000000000, 0x00000000000E0800,
405 0x0000000000000000, 0x00000000000E0800,
406 0x0000000000000000, 0x00000000000E0800,
407 0x0000000000000000, 0x00000000000E0800);
// Zero-mask: keep bytes 1 and 2 only (byte 0 is cleared).
408 constexpr uint64_t k_bytemask2 = 0x0006000600060006;
409 const auto comp_data_shuff3 = _mm512_maskz_shuffle_epi8(k_bytemask2, comp_data_packed, k_byte_shufflemask3);
// Shuffle 4: low bytes of the right-shifted e2 and e5 go to bytes 0 and 1.
411 const __m512i k_byte_shufflemask4 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000A04,
412 0x0000000000000000, 0x0000000000000A04,
413 0x0000000000000000, 0x0000000000000A04,
414 0x0000000000000000, 0x0000000000000A04);
// Zero-mask: keep bytes 0 and 1 only.
415 constexpr uint64_t k_bytemask3 = 0x0003000300030003;
416 const auto comp_data_shuff4 = _mm512_maskz_shuffle_epi8(k_bytemask3, comp_data_packed2, k_byte_shufflemask4);
418 /// Ternary blend of the four shuffled results (three successive bit-selects)
// Bits taken from shuffle 2: byte 0 bits 4:2, byte 1 bits 6:4, byte 2 bits 5:3.
419 const __m512i k_ternlog_select1 = _mm512_set_epi64(0x0000000000000000, 0x000000000038701C,
420 0x0000000000000000, 0x000000000038701C,
421 0x0000000000000000, 0x000000000038701C,
422 0x0000000000000000, 0x000000000038701C);
// Bits taken from shuffle 3: byte 1 bits 3:1, byte 2 bits 2:0.
424 const __m512i k_ternlog_select2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000070E00,
425 0x0000000000000000, 0x0000000000070E00,
426 0x0000000000000000, 0x0000000000070E00,
427 0x0000000000000000, 0x0000000000070E00);
// Bits taken from shuffle 4: byte 0 bits 1:0, byte 1 bit 0.
429 const __m512i k_ternlog_select3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000103,
430 0x0000000000000000, 0x0000000000000103,
431 0x0000000000000000, 0x0000000000000103,
432 0x0000000000000000, 0x0000000000000103);
// Truth table 0xd8 is a bit-select: where the third operand has a 1 the bit comes
// from the second operand, otherwise from the first.
434 auto comp_data_packed3 = _mm512_ternarylogic_epi64(comp_data_shuff1, comp_data_shuff2, k_ternlog_select1, 0xd8);
435 auto comp_data_packed4 = _mm512_ternarylogic_epi64(comp_data_packed3, comp_data_shuff3, k_ternlog_select2, 0xd8);
436 return _mm512_ternarylogic_epi64(comp_data_packed4, comp_data_shuff4, k_ternlog_select3, 0xd8);
// Packs 3-bit values held in 16-bit elements into bytes and then gathers the packed
// bytes of all four 128-bit lanes with a single cross-lane byte permute.
// Per lane (elements e0..e7) the intermediate result is
//   byte 0 = e0 at bits 7:5, e1 at bits 4:2, upper two bits of e2 at bits 1:0
//   byte 1 = lowest bit of e2 at bit 7, e3 at bits 6:4, e4 at bits 3:1, top bit of e5 at bit 0
//   byte 2 = lower two bits of e5 at bits 7:6, e6 at bits 5:3, e7 at bits 2:0
// with bytes 3..15 of the lane zero.
//   comp_data - 32 int16 elements; only the low three bits of each end up in the result
//   returns   - the permuted vector of packed bytes
440 byte_pack3b_snc(const __m512i comp_data)
// Per-element left shifts for e0..e7: 5, 2, 7, 4, 1, 6, 3, 0 bits.
442 const __m512i k_shift_left = _mm512_set_epi64(0x0000000300060001, 0x0004000700020005,
443 0x0000000300060001, 0x0004000700020005,
444 0x0000000300060001, 0x0004000700020005,
445 0x0000000300060001, 0x0004000700020005);
446 const auto comp_data_packed = _mm512_sllv_epi16(comp_data, k_shift_left);
// Right shifts for the two values that straddle a byte boundary: e2 >> 1 and e5 >> 2.
449 const __m512i k_shift_right = _mm512_set_epi64(0x0000000000020000, 0x0000000100000000,
450 0x0000000000020000, 0x0000000100000000,
451 0x0000000000020000, 0x0000000100000000,
452 0x0000000000020000, 0x0000000100000000);
453 const auto comp_data_packed2 = _mm512_srlv_epi16(comp_data, k_shift_right);
// Shuffle 1: low bytes of the left-shifted e0, e2, e5 go to bytes 0, 1, 2.
455 const __m512i k_byte_shufflemask1 = _mm512_set_epi64(0x0000000000000000, 0x00000000000A0400,
456 0x0000000000000000, 0x00000000000A0400,
457 0x0000000000000000, 0x00000000000A0400,
458 0x0000000000000000, 0x00000000000A0400);
// Zero-mask: keep bytes 0..2 of every 128-bit lane.
459 constexpr uint64_t k_bytemask1 = 0x0007000700070007;
460 const auto comp_data_shuff1 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask1);
// Shuffle 2: low bytes of the left-shifted e1, e3, e6 go to bytes 0, 1, 2.
462 const __m512i k_byte_shufflemask2 = _mm512_set_epi64(0x0000000000000000, 0x00000000000C0602,
463 0x0000000000000000, 0x00000000000C0602,
464 0x0000000000000000, 0x00000000000C0602,
465 0x0000000000000000, 0x00000000000C0602);
466 const auto comp_data_shuff2 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask2);
// Shuffle 3: low bytes of the left-shifted e4 and e7 go to bytes 1 and 2.
468 const __m512i k_byte_shufflemask3 = _mm512_set_epi64(0x0000000000000000, 0x00000000000E0800,
469 0x0000000000000000, 0x00000000000E0800,
470 0x0000000000000000, 0x00000000000E0800,
471 0x0000000000000000, 0x00000000000E0800);
// Zero-mask: keep bytes 1 and 2 only (byte 0 is cleared).
472 constexpr uint64_t k_bytemask2 = 0x0006000600060006;
473 const auto comp_data_shuff3 = _mm512_maskz_shuffle_epi8(k_bytemask2, comp_data_packed, k_byte_shufflemask3);
// Shuffle 4: low bytes of the right-shifted e2 and e5 go to bytes 0 and 1.
475 const __m512i k_byte_shufflemask4 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000A04,
476 0x0000000000000000, 0x0000000000000A04,
477 0x0000000000000000, 0x0000000000000A04,
478 0x0000000000000000, 0x0000000000000A04);
// Zero-mask: keep bytes 0 and 1 only.
479 constexpr uint64_t k_bytemask3 = 0x0003000300030003;
480 const auto comp_data_shuff4 = _mm512_maskz_shuffle_epi8(k_bytemask3, comp_data_packed2, k_byte_shufflemask4);
482 /// Ternary blend of the four shuffled results (three successive bit-selects)
// Bits taken from shuffle 2: byte 0 bits 4:2, byte 1 bits 6:4, byte 2 bits 5:3.
483 const __m512i k_ternlog_select1 = _mm512_set_epi64(0x0000000000000000, 0x000000000038701C,
484 0x0000000000000000, 0x000000000038701C,
485 0x0000000000000000, 0x000000000038701C,
486 0x0000000000000000, 0x000000000038701C);
// Bits taken from shuffle 3: byte 1 bits 3:1, byte 2 bits 2:0.
488 const __m512i k_ternlog_select2 = _mm512_set_epi64(0x0000000000000000, 0x0000000000070E00,
489 0x0000000000000000, 0x0000000000070E00,
490 0x0000000000000000, 0x0000000000070E00,
491 0x0000000000000000, 0x0000000000070E00);
// Bits taken from shuffle 4: byte 0 bits 1:0, byte 1 bit 0.
493 const __m512i k_ternlog_select3 = _mm512_set_epi64(0x0000000000000000, 0x0000000000000103,
494 0x0000000000000000, 0x0000000000000103,
495 0x0000000000000000, 0x0000000000000103,
496 0x0000000000000000, 0x0000000000000103);
// Truth table 0xd8 is a bit-select: where the third operand has a 1 the bit comes
// from the second operand, otherwise from the first.
498 auto comp_data_packed3 = _mm512_ternarylogic_epi64(comp_data_shuff1, comp_data_shuff2, k_ternlog_select1, 0xd8);
499 auto comp_data_packed4 = _mm512_ternarylogic_epi64(comp_data_packed3, comp_data_shuff3, k_ternlog_select2, 0xd8);
500 auto comp_data_packed5 = _mm512_ternarylogic_epi64(comp_data_packed4, comp_data_shuff4, k_ternlog_select3, 0xd8);
// Byte-permute indices: the first three dwords reference source bytes
// 0x00,0x01,0x02,0x10 / 0x11,0x12,0x20,0x21 / 0x22,0x30,0x31,0x32, i.e. the three
// packed bytes of each of the four lanes in lane order.
// NOTE(review): the line naming the constructor intrinsic is not visible in this
// listing, so which end of the vector these dwords occupy cannot be confirmed here.
502 const auto k_byte_permute =
504 0x10020100, 0x21201211, 0x32313022, 0xFFFFFFFF,
505 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
506 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
507 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
509 return _mm512_permutexvar_epi8(k_byte_permute,comp_data_packed5);
// AVX-512 64QAM modulation compression.
// Each int16 sample is divided by (unit >> 2); 7 is added to every element whose
// original sample was negative; the results are packed three bits per sample by
// byte_pack3b(), which yields three bytes per 128-bit lane.
//   pData - interleaved int16 I/Q samples
//   pOut  - packed output, 12 bytes per 16 I/Q pairs
//   unit  - scaling value; only unit >> 2 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, some declarations, the nSc1 assignment, the
// pData advance and the conditions around the printf and the tail) are not visible
// in this listing.
512 void mod_compression_64qam_avx512(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
514 int16_t bit_unit = unit>>2;
// Diagnostic text; the condition guarding it is on a line not visible here.
517 printf("modulation compression unit is too low!\n ");
520 __m512i symbol,symbol_unit ,bit_convert,byte_pack;
// Byte mask for the 128-bit stores below: write bytes 0..2 only.
522 __mmask16 mask_store = 0x7;
// Number of I/Q pairs covered by whole 16-pair iterations.
524 nSc0 = nSc&0xfffffff0;
526 symbol_unit = _mm512_set1_epi16(bit_unit);
527 bit_convert = _mm512_set1_epi16(7);
// Main loop: 16 I/Q pairs in, 12 bytes out per iteration.
528 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16 ,pOut = pOut+12)
530 symbol = _mm512_loadu_epi16(pData);
// Sign bits of the 32 original samples.
531 mask32 = _mm512_movepi16_mask(symbol);
533 symbol = _mm512_div_epi16(symbol ,symbol_unit);
// Add 7 only in the elements flagged as negative by mask32.
534 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
536 byte_pack = byte_pack3b(symbol);
// Three packed bytes from each of the four 128-bit lanes, written back to back.
537 _mm_mask_storeu_epi8(pOut , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 0));
538 _mm_mask_storeu_epi8(pOut+3 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 1));
539 _mm_mask_storeu_epi8(pOut+6 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 2));
540 _mm_mask_storeu_epi8(pOut+9 , mask_store ,_mm512_extracti64x2_epi64(byte_pack, 3));
// Tail: k1 has the nSc1 lowest bits set, one bit per remaining 32-bit I/Q pair.
545 __mmask16 k1 , left_mask;
546 k1 = ((__mmask16)1<<nSc1)-1;
// Masked load; pairs beyond the remainder are read as zero.
547 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
548 mask32 = _mm512_movepi16_mask(symbol);
549 symbol = _mm512_div_epi16(symbol ,symbol_unit);
550 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
551 byte_pack = byte_pack3b(symbol);
// Each lane covers four pairs = three output bytes. The lane's slice of k1, limited
// to three bits, is used as the byte mask: 1, 2 or 3+ pairs write 1, 2 or 3 bytes.
552 left_mask = k1&mask_store;
553 _mm_mask_storeu_epi8(pOut , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 0));
554 left_mask = (k1>>4)&mask_store;
555 _mm_mask_storeu_epi8(pOut+3 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 1));
556 left_mask = (k1>>8)&mask_store;
557 _mm_mask_storeu_epi8(pOut+6 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 2));
558 left_mask = (k1>>12)&mask_store;
559 _mm_mask_storeu_epi8(pOut+9 , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 3));
563 void mod_compression_64qam_snc(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
565 int16_t bit_unit = unit>>2;
568 printf("modulation compression unit is too low!\n ");
571 __m512i symbol,symbol_unit ,bit_convert,byte_pack;
573 __mmask16 mask_store = 0x7;
575 nSc0 = nSc&0xfffffff0;
577 symbol_unit = _mm512_set1_epi16(bit_unit);
578 bit_convert = _mm512_set1_epi16(7);
579 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16 ,pOut = pOut+12)
581 symbol = _mm512_loadu_epi16(pData);
582 mask32 = _mm512_movepi16_mask(symbol);
584 symbol = _mm512_div_epi16(symbol ,symbol_unit);
585 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
586 _mm512_mask_storeu_epi32(pOut , mask_store ,byte_pack3b_snc(symbol));
591 __mmask16 k1 , left_mask;
592 int8_t left_byte = 0;
593 k1 = ((__mmask16)1<<nSc1)-1;
594 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
595 mask32 = _mm512_movepi16_mask(symbol);
596 symbol = _mm512_div_epi16(symbol ,symbol_unit);
597 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
598 byte_pack = byte_pack3b_snc(symbol);
599 left_byte = (nSc*3+3)>>2;
600 left_mask = ((__mmask16)1<<left_byte)-1;
601 _mm_mask_storeu_epi8(pOut , left_mask ,_mm512_extracti64x2_epi64(byte_pack, 0));
// Packs 4-bit values held in 16-bit elements into bytes: within each 128-bit lane
// the pair (e0,e1) becomes byte 0, (e2,e3) byte 1, (e4,e5) byte 2 and (e6,e7) byte 3,
// with the even element in the high nibble and the odd element in the low nibble.
// The four packed bytes (one dword) of every lane are then moved next to each other
// into the four lowest dwords of the result.
//   comp_data - 32 int16 elements; only the low four bits of each end up in the result
//   returns   - vector whose lowest 16 bytes hold the packed data
606 byte_pack4b(const __m512i comp_data)
// Shift even elements left by 4 bits; odd elements are shifted by 0.
608 const __m512i k_shift_Left = _mm512_set_epi64(0x0000000400000004, 0x0000000400000004,
609 0x0000000400000004, 0x0000000400000004,
610 0x0000000400000004, 0x0000000400000004,
611 0x0000000400000004, 0x0000000400000004);
612 const auto comp_data_packed = _mm512_sllv_epi16(comp_data, k_shift_Left);
// Shuffle 1: low bytes of the even elements e0, e2, e4, e6 go to bytes 0..3.
614 const __m512i k_byte_shufflemask1 = _mm512_set_epi64(0x0000000000000000, 0x000000000c080400,
615 0x0000000000000000, 0x000000000c080400,
616 0x0000000000000000, 0x000000000c080400,
617 0x0000000000000000, 0x000000000c080400);
// Zero-mask: keep bytes 0..3 of every 128-bit lane.
618 constexpr uint64_t k_bytemask1 = 0x000F000F000F000F;
619 const auto comp_data_shuff1 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask1);
// Shuffle 2: low bytes of the odd elements e1, e3, e5, e7 go to bytes 0..3.
621 const __m512i k_byte_shufflemask2 = _mm512_set_epi64(0x0000000000000000, 0x000000000E0A0602,
622 0x0000000000000000, 0x000000000E0A0602,
623 0x0000000000000000, 0x000000000E0A0602,
624 0x0000000000000000, 0x000000000E0A0602);
625 const auto comp_data_shuff2 = _mm512_maskz_shuffle_epi8(k_bytemask1, comp_data_packed, k_byte_shufflemask2);
627 /// Ternary blend of the two shuffled results
// Low nibble of each packed byte comes from shuffle 2, the rest from shuffle 1
// (truth table 0xd8: third operand selects between second and first operand).
628 const __m512i k_ternlog_select = _mm512_set_epi64(0x0000000000000000, 0x000000000F0F0F0F,
629 0x0000000000000000, 0x000000000F0F0F0F,
630 0x0000000000000000, 0x000000000F0F0F0F,
631 0x0000000000000000, 0x000000000F0F0F0F);
632 const auto comp_data_packed2 = _mm512_ternarylogic_epi64(comp_data_shuff1, comp_data_shuff2, k_ternlog_select, 0xd8);
// Dword permute indices: result dwords 0..3 read source dwords 0, 4, 8, 12 (the first
// dword of each lane); every other result dword reads source dword 15.
634 const __m512i k_dwordmask = _mm512_set_epi64(0x0000000F0000000F, 0x0000000F0000000F,
635 0x0000000F0000000F, 0x0000000F0000000F,
636 0x0000000F0000000F, 0x0000000F0000000F,
637 0x0000000C00000008, 0x0000000400000000);
638 return _mm512_permutevar_epi32 (k_dwordmask,comp_data_packed2);
// AVX-512 256QAM modulation compression.
// Each int16 sample is divided by (unit >> 3); 15 is added to every element whose
// original sample was negative; byte_pack4b() packs the results four bits per
// sample, i.e. one byte per I/Q pair.
//   pData - interleaved int16 I/Q samples
//   pOut  - packed output, one byte per I/Q pair
//   unit  - scaling value; only unit >> 3 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, some declarations, the nSc1 assignment, the
// pData/pOut advances and the conditions around the printf and the tail) are not
// visible in this listing.
641 void mod_compression_256qam_avx512(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
643 int16_t bit_unit = unit>>3;
// Diagnostic text; the condition guarding it is on a line not visible here.
646 printf("modulation compression unit is too low!\n ");
649 __m512i symbol,symbol_unit ,bit_convert;
// Dword mask for the main-loop store: write the four lowest dwords (16 bytes).
651 __mmask16 mask_store =0xF;
// Number of I/Q pairs covered by whole 16-pair iterations.
653 nSc0 = nSc&0xfffffff0;
655 symbol_unit = _mm512_set1_epi16(bit_unit);
656 bit_convert = _mm512_set1_epi16(15);
657 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16)
659 symbol = _mm512_loadu_epi16(pData);
// Sign bits of the 32 original samples.
660 mask32 = _mm512_movepi16_mask(symbol);
662 symbol = _mm512_div_epi16(symbol ,symbol_unit);
// Add 15 only in the elements flagged as negative by mask32.
663 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
665 _mm512_mask_storeu_epi32 (pOut,mask_store, byte_pack4b(symbol));
// Tail: k1 has the nSc1 lowest bits set. It selects the 32-bit I/Q pairs to load
// and, widened to a byte mask, the same number of output bytes to store.
672 k1 = ((__mmask16)1<<nSc1)-1;
673 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
674 mask32 = _mm512_movepi16_mask(symbol);
675 symbol = _mm512_div_epi16(symbol ,symbol_unit);
676 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
677 _mm512_mask_storeu_epi8 (pOut ,(__mmask64)k1 ,byte_pack4b(symbol));
// 256QAM modulation compression, "snc" variant.
// Each int16 sample is divided by (unit >> 3); 15 is added to every element whose
// original sample was negative; byte_pack4b() packs the results four bits per
// sample, i.e. one byte per I/Q pair. The statements shown are the same as those of
// mod_compression_256qam_avx512.
//   pData - interleaved int16 I/Q samples
//   pOut  - packed output, one byte per I/Q pair
//   unit  - scaling value; only unit >> 3 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, some declarations, the nSc1 assignment, the
// pData/pOut advances and the conditions around the printf and the tail) are not
// visible in this listing.
681 void mod_compression_256qam_snc(int16_t *pData, int8_t *pOut, int16_t unit, int32_t nSc)
683 int16_t bit_unit = unit>>3;
// Diagnostic text; the condition guarding it is on a line not visible here.
686 printf("modulation compression unit is too low!\n ");
689 __m512i symbol,symbol_unit ,bit_convert;
// Dword mask for the main-loop store: write the four lowest dwords (16 bytes).
691 __mmask16 mask_store =0xF;
// Number of I/Q pairs covered by whole 16-pair iterations.
693 nSc0 = nSc&0xfffffff0;
695 symbol_unit = _mm512_set1_epi16(bit_unit);
696 bit_convert = _mm512_set1_epi16(15);
697 for (int32_t iSc = 0 ; iSc<nSc0 ; iSc =iSc+16)
699 symbol = _mm512_loadu_epi16(pData);
// Sign bits of the 32 original samples.
700 mask32 = _mm512_movepi16_mask(symbol);
702 symbol = _mm512_div_epi16(symbol ,symbol_unit);
// Add 15 only in the elements flagged as negative by mask32.
703 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
705 _mm512_mask_storeu_epi32 (pOut,mask_store, byte_pack4b(symbol));
// Tail: k1 has the nSc1 lowest bits set. It selects the 32-bit I/Q pairs to load
// and, widened to a byte mask, the same number of output bytes to store.
712 k1 = ((__mmask16)1<<nSc1)-1;
713 symbol = _mm512_mask_loadu_epi32(_mm512_setzero_epi32() ,k1 ,pData);
714 mask32 = _mm512_movepi16_mask(symbol);
715 symbol = _mm512_div_epi16(symbol ,symbol_unit);
716 symbol =_mm512_mask_add_epi16(symbol,mask32 ,symbol,bit_convert);
717 _mm512_mask_storeu_epi8 (pOut ,(__mmask64)k1 ,byte_pack4b(symbol));
// Dispatches a modulation-compression request on request->modulation to the "_snc"
// kernels; the QPSK call shown uses mod_compression_qpsk_avx512.
//   request  - supplies data_in, unit, num_symbols and modulation
//   response - supplies data_out, which the selected kernel writes
// NOTE(review): the case labels, break statements and return values are on lines not
// visible in this listing, so the mapping and the return codes are not documented here.
721 int xranlib_5gnr_mod_compression_snc(const struct xranlib_5gnr_mod_compression_request* request,
722 struct xranlib_5gnr_mod_compression_response* response){
724 switch(request->modulation)
727 mod_compression_qpsk_avx512(request->data_in, response->data_out, request->unit, request->num_symbols);
730 mod_compression_16qam_snc(request->data_in, response->data_out, request->unit, request->num_symbols);
733 mod_compression_64qam_snc(request->data_in, response->data_out, request->unit, request->num_symbols);
736 mod_compression_256qam_snc(request->data_in, response->data_out, request->unit, request->num_symbols);
// Diagnostic for a modulation value not handled above.
739 printf("Error invalid modulation compression request\n");
// Public entry point for modulation compression; forwards the request to one of the
// three back ends and returns that back end's result.
//   request  - compression request, passed through unchanged
//   response - compression response, passed through unchanged
// NOTE(review): the lines that choose between the first return and the runtime
// check are not visible in this listing (presumably preprocessor conditionals).
745 int xranlib_5gnr_mod_compression(const struct xranlib_5gnr_mod_compression_request* request,
746 struct xranlib_5gnr_mod_compression_response* response){
// Scalar C implementation.
748 return (xranlib_5gnr_mod_compression_c(request, response));
// Runtime CPU check: use the "_snc" kernels when AVX512-IFMA52 is reported.
750 if(_may_i_use_cpu_feature(_FEATURE_AVX512IFMA52))
751 return (xranlib_5gnr_mod_compression_snc(request, response));
// Generic AVX-512 kernels.
753 return (xranlib_5gnr_mod_compression_avx512(request, response));
// Dispatches a modulation-compression request on request->modulation to the scalar
// "_c" kernels.
//   request  - supplies data_in, unit, num_symbols and modulation
//   response - supplies data_out, which the selected kernel writes
// NOTE(review): the case labels, break statements and return values are on lines not
// visible in this listing, so the mapping and the return codes are not documented here.
758 int xranlib_5gnr_mod_compression_c(const struct xranlib_5gnr_mod_compression_request* request,
759 struct xranlib_5gnr_mod_compression_response* response){
761 switch(request->modulation)
764 mod_compression_qpsk_c(request->data_in, response->data_out, request->unit, request->num_symbols);
767 mod_compression_16qam_c(request->data_in, response->data_out, request->unit,request->num_symbols);
770 mod_compression_64qam_c(request->data_in, response->data_out, request->unit, request->num_symbols);
773 mod_compression_256qam_c(request->data_in, response->data_out, request->unit, request->num_symbols);
// Diagnostic for a modulation value not handled above.
776 printf("Error invalid modulation compression request\n");
// Dispatches a modulation-compression request on request->modulation to the
// "_avx512" kernels.
//   request  - supplies data_in, unit, num_symbols and modulation
//   response - supplies data_out, which the selected kernel writes
// NOTE(review): the case labels, break statements and return values are on lines not
// visible in this listing, so the mapping and the return codes are not documented here.
782 int xranlib_5gnr_mod_compression_avx512(const struct xranlib_5gnr_mod_compression_request* request,
783 struct xranlib_5gnr_mod_compression_response* response){
785 switch(request->modulation)
788 mod_compression_qpsk_avx512(request->data_in, response->data_out, request->unit, request->num_symbols);
791 mod_compression_16qam_avx512(request->data_in, response->data_out, request->unit, request->num_symbols);
794 mod_compression_64qam_avx512(request->data_in, response->data_out, request->unit, request->num_symbols);
797 mod_compression_256qam_avx512(request->data_in, response->data_out, request->unit, request->num_symbols);
// Diagnostic for a modulation value not handled above.
800 printf("Error invalid modulation compression request\n");
// Scalar QPSK modulation decompression: expands one bit per sample back to an int16
// value of +(unit >> 1) for bit 0 or -(unit >> 1) for bit 1.
//   pData   - packed input, four I/Q pairs per byte, first pair in the top two bits
//   pOut    - interleaved int16 I/Q output; pOut[2*iSc] is I, pOut[2*iSc+1] is Q
//   unit    - scaling value; only unit >> 1 is used
//   nSc     - number of I/Q pairs to process
//   re_mask - bit (iSc % 12) enables output for that pair; in the statements shown,
//             pairs whose bit is clear are not written
// NOTE(review): braces and any lines after the last assignment are not visible in
// this listing.
807 mod_decompression_qpsk_c(int8_t *pData,int16_t *pOut,int16_t unit, int32_t nSc ,int16_t re_mask)
// Lookup table indexed by the decoded bit.
809 int16_t symbol_unit[2] = {0};
810 symbol_unit[0] = (unit>>1);
811 symbol_unit[1] = (unit>>1)*-1;
812 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Index of this pair within a repeating 12-entry pattern.
814 uint8_t mask_pos= iSc %12;
815 if (1 == ((re_mask >> mask_pos)&0x1))
// Slot within the byte (0..3) and index of the byte holding this pair.
817 uint8_t symbol_pos= iSc &0x3;
818 uint32_t byte_pos= iSc >>2;
// I bit sits at (7 - 2*slot), Q bit at (6 - 2*slot).
819 uint8_t bit_i = (pData[byte_pos]>>(7-(symbol_pos*2)))&0x1;
820 pOut[iSc*2] = symbol_unit[bit_i];
821 uint8_t bit_q = (pData[byte_pos]>>(6-(symbol_pos*2)))&0x1;
822 pOut[iSc*2+1] = symbol_unit[bit_q];
// Scalar 16QAM modulation decompression: expands each 2-bit code back to an int16
// value taken from a four-entry table built from (unit >> 2).
//   pData - packed input, two I/Q pairs per byte (I then Q, first pair in the high nibble)
//   pOut  - interleaved int16 I/Q output; pOut[2*iSc] is I, pOut[2*iSc+1] is Q
//   unit  - scaling value; only unit >> 2 is used
//   nSc   - number of I/Q pairs to process
828 mod_decompression_16qam_c(int8_t *pData,int16_t *pOut,int16_t unit, int32_t nSc)
// Code -> value table: 0 -> +1x, 1 -> +3x, 2 -> -3x, 3 -> -1x, where x = unit >> 2.
830 int16_t symbol_unit[4] = {0};
831 symbol_unit[0] = (unit>>2);
832 symbol_unit[1] = (unit>>2)*3;
833 symbol_unit[3] = (unit>>2)*-1;
834 symbol_unit[2] = (unit>>2)*-3;
835 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Slot within the byte (0 or 1) and index of the byte holding this pair.
837 uint8_t symbol_pos= iSc &0x1;
838 uint32_t byte_pos= iSc >>1;
// I code sits at bits (7 - 4*slot):(6 - 4*slot), Q code two bits below it.
839 uint8_t bit_i = (pData[byte_pos]>>(6-(symbol_pos*4)))&0x3;
840 pOut[iSc*2] = symbol_unit[bit_i];
841 uint8_t bit_q = (pData[byte_pos]>>(4-(symbol_pos*4)))&0x3;
842 pOut[iSc*2+1] = symbol_unit[bit_q];
// Scalar 64QAM modulation decompression: expands each 3-bit code back to an int16
// value taken from an eight-entry table built from (unit >> 3). Four I/Q pairs
// occupy three input bytes, so the extraction depends on the pair's position modulo 4.
//   pData - packed input; the statements shown read pData[0..2]
//   pOut  - interleaved int16 I/Q output; pOut[2*iSc] is I, pOut[2*iSc+1] is Q
//   unit  - scaling value; only unit >> 3 is used
//   nSc   - number of I/Q pairs to process
// NOTE(review): short lines (braces, the first `if`, and whatever advances pData
// after each group of four) are not visible in this listing.
847 mod_decompression_64qam_c(int8_t *pData,int16_t *pOut,int16_t unit, int32_t nSc)
// Code -> value table: 0..3 -> +1x, +3x, +5x, +7x; 4..7 -> -7x, -5x, -3x, -1x,
// where x = unit >> 3.
849 int16_t symbol_unit[8] = {0};
850 symbol_unit[0] = (unit>>3);
851 symbol_unit[1] = (unit>>3)*3;
852 symbol_unit[2] = (unit>>3)*5;
853 symbol_unit[3] = (unit>>3)*7;
854 symbol_unit[7] = (unit>>3)*-1;
855 symbol_unit[6] = (unit>>3)*-3;
856 symbol_unit[5] = (unit>>3)*-5;
857 symbol_unit[4] = (unit>>3)*-7;
858 uint8_t bit_i , bit_q ;
859 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Position (0..3) of this pair within its group of four.
861 uint8_t symbol_pos= iSc %4;
// I from byte 0 bits 7:5, Q from byte 0 bits 4:2.
// NOTE(review): presumably the position-0 case; its condition line is not visible.
864 bit_i = (pData[0]>>5)&0x7;
865 bit_q = (pData[0]>>2)&0x7;
867 else if (1 == symbol_pos)
// I straddles bytes 0 and 1 (byte 0 bits 1:0, byte 1 bit 7); Q from byte 1 bits 6:4.
869 bit_i = ((pData[0]&0x3)<<1)|((pData[1]>>7)&0x1);
870 bit_q = (pData[1]>>4)&0x7;
872 else if (2 == symbol_pos)
// Q straddles bytes 1 and 2 (byte 1 bit 0, byte 2 bits 7:6); I from byte 1 bits 3:1.
874 bit_q = ((pData[1]&0x1)<<2)|((pData[2]>>6)&0x3);
875 bit_i = (pData[1]>>1)&0x7;
877 else if (3 == symbol_pos)
// I from byte 2 bits 5:3, Q from byte 2 bits 2:0.
879 bit_i = (pData[2]>>3)&0x7;
880 bit_q = pData[2]&0x7;
// Map both codes through the table.
883 pOut[iSc*2] = symbol_unit[bit_i];
884 pOut[iSc*2+1] = symbol_unit[bit_q];
// Scalar 256QAM modulation decompression: expands each 4-bit code back to an int16
// value taken from a sixteen-entry table built from (unit >> 4).
//   pData - packed input, one byte per I/Q pair (I in the high nibble, Q in the low nibble)
//   pOut  - interleaved int16 I/Q output; pOut[2*iSc] is I, pOut[2*iSc+1] is Q
//   unit  - scaling value; only unit >> 4 is used
//   nSc   - number of I/Q pairs to process
889 mod_decompression_256qam_c(int8_t *pData,int16_t *pOut,int16_t unit,int32_t nSc)
// Code -> value table: 0..7 -> +1x, +3x, ..., +15x; 8..15 -> -15x, -13x, ..., -1x,
// where x = unit >> 4.
891 int16_t symbol_unit[16] = {0};
892 symbol_unit[0] = (unit>>4);
893 symbol_unit[1] = (unit>>4)*3;
894 symbol_unit[2] = (unit>>4)*5;
895 symbol_unit[3] = (unit>>4)*7;
896 symbol_unit[4] = (unit>>4)*9;
897 symbol_unit[5] = (unit>>4)*11;
898 symbol_unit[6] = (unit>>4)*13;
899 symbol_unit[7] = (unit>>4)*15;
900 symbol_unit[15] = (unit>>4)*-1;
901 symbol_unit[14] = (unit>>4)*-3;
902 symbol_unit[13] = (unit>>4)*-5;
903 symbol_unit[12] = (unit>>4)*-7;
904 symbol_unit[11] = (unit>>4)*-9;
905 symbol_unit[10] = (unit>>4)*-11;
906 symbol_unit[9] = (unit>>4)*-13;
907 symbol_unit[8] = (unit>>4)*-15;
908 for (int32_t iSc = 0 ; iSc<nSc ; iSc ++)
// Split the byte into its two nibbles and map each through the table.
910 uint8_t bit_i = (pData[iSc]>>4)&0xF;
911 uint8_t bit_q = pData[iSc]&0xF;
912 pOut[iSc*2] = symbol_unit[bit_i];
913 pOut[iSc*2+1] = symbol_unit[bit_q];
// Dispatches a modulation-decompression request on request->modulation to the
// scalar decompression kernels. Only the QPSK kernel receives request->re_mask.
//   request  - supplies data_in, unit, num_symbols, modulation and re_mask
//   response - supplies data_out, which the selected kernel writes
// NOTE(review): the case labels, break statements and return values are on lines not
// visible in this listing, so the mapping and the return codes are not documented here.
917 int xranlib_5gnr_mod_decompression(const struct xranlib_5gnr_mod_decompression_request* request,
918 struct xranlib_5gnr_mod_decompression_response* response){
920 switch(request->modulation)
923 mod_decompression_qpsk_c(request->data_in, response->data_out, request->unit, request->num_symbols, request->re_mask);
926 mod_decompression_16qam_c(request->data_in, response->data_out, request->unit,request->num_symbols);
929 mod_decompression_64qam_c(request->data_in, response->data_out, request->unit, request->num_symbols);
932 mod_decompression_256qam_c(request->data_in, response->data_out, request->unit, request->num_symbols);
// Diagnostic for a modulation value not handled above.
// NOTE(review): the text says "compression" although this is the decompression path;
// consider rewording so the two paths can be told apart in logs.
935 printf("Error invalid modulation compression request\n");