OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_encoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_encoder_avx2.cpp
35//***************************************************************************/
36
37#include "ojph_arch.h"
38#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
39
40#include <cassert>
41#include <cstring>
42#include <cstdint>
43#include <climits>
44#include <immintrin.h>
45
46#include "ojph_mem.h"
47#include "ojph_arch.h"
48#include "ojph_block_encoder.h"
49#include "ojph_message.h"
50
51#ifdef OJPH_COMPILER_MSVC
52 #define likely(x) (x)
53 #define unlikely(x) (x)
54#else
55 #define likely(x) __builtin_expect((x), 1)
56 #define unlikely(x) __builtin_expect((x), 0)
57#endif
58
59namespace ojph {
60 namespace local {
61
63 // tables
65
66 //VLC encoding
67 // index is (c_q << 8) + (rho << 4) + eps
68 // data is (cwd << 8) + (cwd_len << 4) + e_k
69 // table 0 is for the initial line of quads
70 static ui32 vlc_tbl0[2048];
71 static ui32 vlc_tbl1[2048];
72
73 //UVLC encoding
74 static ui32 ulvc_cwd_pre[33];
75 static int ulvc_cwd_pre_len[33];
76 static ui32 ulvc_cwd_suf[33];
77 static int ulvc_cwd_suf_len[33];
78
80 static bool vlc_init_tables()
81 {
82 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
83 vlc_src_table tbl0[] = {
84 #include "table0.h"
85 };
86 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
87
88 si32 pattern_popcnt[16];
89 for (ui32 i = 0; i < 16; ++i)
90 pattern_popcnt[i] = (si32)population_count(i);
91
92 vlc_src_table* src_tbl = tbl0;
93 ui32 *tgt_tbl = vlc_tbl0;
94 size_t tbl_size = tbl0_size;
95 for (int i = 0; i < 2048; ++i)
96 {
97 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
98 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
99 tgt_tbl[i] = 0;
100 else
101 {
102 vlc_src_table *best_entry = NULL;
103 if (emb) // u_off = 1
104 {
105 int best_e_k = -1;
106 for (size_t j = 0; j < tbl_size; ++j)
107 {
108 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
109 if (src_tbl[j].u_off == 1)
110 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
111 {
112 //now we need to find the smallest cwd with the highest
113 // number of bits set in e_k
114 int ones_count = pattern_popcnt[src_tbl[j].e_k];
115 if (ones_count >= best_e_k)
116 {
117 best_entry = src_tbl + j;
118 best_e_k = ones_count;
119 }
120 }
121 }
122 }
123 else // u_off = 0
124 {
125 for (size_t j = 0; j < tbl_size; ++j)
126 {
127 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
128 if (src_tbl[j].u_off == 0)
129 {
130 best_entry = src_tbl + j;
131 break;
132 }
133 }
134 }
135 assert(best_entry);
136 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
137 + best_entry->e_k);
138 }
139 }
140
141 vlc_src_table tbl1[] = {
142 #include "table1.h"
143 };
144 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
145
146 src_tbl = tbl1;
147 tgt_tbl = vlc_tbl1;
148 tbl_size = tbl1_size;
149 for (int i = 0; i < 2048; ++i)
150 {
151 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
152 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
153 tgt_tbl[i] = 0;
154 else
155 {
156 vlc_src_table *best_entry = NULL;
157 if (emb) // u_off = 1
158 {
159 int best_e_k = -1;
160 for (size_t j = 0; j < tbl_size; ++j)
161 {
162 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
163 if (src_tbl[j].u_off == 1)
164 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
165 {
166 //now we need to find the smallest cwd with the highest
167 // number of bits set in e_k
168 int ones_count = pattern_popcnt[src_tbl[j].e_k];
169 if (ones_count >= best_e_k)
170 {
171 best_entry = src_tbl + j;
172 best_e_k = ones_count;
173 }
174 }
175 }
176 }
177 else // u_off = 0
178 {
179 for (size_t j = 0; j < tbl_size; ++j)
180 {
181 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
182 if (src_tbl[j].u_off == 0)
183 {
184 best_entry = src_tbl + j;
185 break;
186 }
187 }
188 }
189 assert(best_entry);
190 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
191 + best_entry->e_k);
192 }
193 }
194
195
196 return true;
197 }
198
200 static bool uvlc_init_tables()
201 {
202 //code goes from 0 to 31, extension and 32 are not supported here
203 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
204 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
205 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
206 ulvc_cwd_pre_len[2] = 2;
207 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
208 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
209 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
210 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
211 ulvc_cwd_suf_len[2] = 0;
212 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
213 for (int i = 5; i < 33; ++i)
214 {
215 ulvc_cwd_pre[i] = 0;
216 ulvc_cwd_pre_len[i] = 3;
217 ulvc_cwd_suf[i] = (ui32)(i-5);
218 ulvc_cwd_suf_len[i] = 5;
219 }
220 return true;
221 }
222
224 static bool tables_initialized = false;
225
228 if (!tables_initialized) {
229 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
230 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
233 }
234 return tables_initialized;
235 }
236
238 //
240 struct mel_struct {
241 //storage
242 ui8* buf; //pointer to data buffer
243 ui32 pos; //position of next writing within buf
244 ui32 buf_size; //size of buffer, which we must not exceed
245
246 // all these can be replaced by bytes
247 int remaining_bits; //number of empty bits in tmp
248 int tmp; //temporary storage of coded bits
249 int run; //number of 0 run
250 int k; //state
251 int threshold; //threshold where one bit must be coded
252 };
253
255 static inline void
256 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
257 {
258 melp->buf = data;
259 melp->pos = 0;
260 melp->buf_size = buffer_size;
261 melp->remaining_bits = 8;
262 melp->tmp = 0;
263 melp->run = 0;
264 melp->k = 0;
265 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
266 }
267
269 static inline void
270 mel_emit_bit(mel_struct* melp, int v)
271 {
272 melp->tmp = (melp->tmp << 1) + v;
273 melp->remaining_bits--;
274 if (melp->remaining_bits == 0) {
275 melp->buf[melp->pos++] = (ui8)melp->tmp;
276 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
277 melp->tmp = 0;
278 }
279 }
280
282 static inline void
283 mel_encode(mel_struct* melp, bool bit)
284 {
285 //MEL exponent
286 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
287
288 if (bit == false) {
289 ++melp->run;
290 if (melp->run >= melp->threshold) {
291 mel_emit_bit(melp, 1);
292 melp->run = 0;
293 melp->k = ojph_min(12, melp->k + 1);
294 melp->threshold = 1 << mel_exp[melp->k];
295 }
296 } else {
297 mel_emit_bit(melp, 0);
298 int t = mel_exp[melp->k];
299 while (t > 0) {
300 mel_emit_bit(melp, (melp->run >> --t) & 1);
301 }
302 melp->run = 0;
303 melp->k = ojph_max(0, melp->k - 1);
304 melp->threshold = 1 << mel_exp[melp->k];
305 }
306 }
307
309 //
311 struct vlc_struct_avx2 {
312 //storage
313 ui8* buf; //pointer to data buffer
314 ui32 pos; //position of next writing within buf
315 ui32 buf_size; //size of buffer, which we must not exceed
316
317 int used_bits; //number of occupied bits in tmp
318 ui64 tmp; //temporary storage of coded bits
319 bool last_greater_than_8F; //true if last byte us greater than 0x8F
320 };
321
323 static inline void
324 vlc_init(vlc_struct_avx2* vlcp, ui32 buffer_size, ui8* data)
325 {
326 vlcp->buf = data + buffer_size - 1; //points to last byte
327 vlcp->pos = 1; //locations will be all -pos
328 vlcp->buf_size = buffer_size;
329
330 vlcp->buf[0] = 0xFF;
331 vlcp->used_bits = 4;
332 vlcp->tmp = 0xF;
333 vlcp->last_greater_than_8F = true;
334 }
335
337 static inline void
338 vlc_encode(vlc_struct_avx2* vlcp, ui32 cwd, int cwd_len)
339 {
340 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
341 vlcp->used_bits += cwd_len;
342
343 while (vlcp->used_bits >= 8) {
344 ui8 tmp;
345
346 if (unlikely(vlcp->last_greater_than_8F)) {
347 tmp = vlcp->tmp & 0x7F;
348
349 if (likely(tmp != 0x7F)) {
350 tmp = vlcp->tmp & 0xFF;
351 *(vlcp->buf - vlcp->pos) = tmp;
352 vlcp->last_greater_than_8F = tmp > 0x8F;
353 vlcp->tmp >>= 8;
354 vlcp->used_bits -= 8;
355 } else {
356 *(vlcp->buf - vlcp->pos) = tmp;
357 vlcp->last_greater_than_8F = false;
358 vlcp->tmp >>= 7;
359 vlcp->used_bits -= 7;
360 }
361
362 } else {
363 tmp = vlcp->tmp & 0xFF;
364 *(vlcp->buf - vlcp->pos) = tmp;
365 vlcp->last_greater_than_8F = tmp > 0x8F;
366 vlcp->tmp >>= 8;
367 vlcp->used_bits -= 8;
368 }
369
370 vlcp->pos++;
371 }
372 }
373
375 //
377 static inline void
378 terminate_mel_vlc(mel_struct* melp, vlc_struct_avx2* vlcp)
379 {
380 if (melp->run > 0)
381 mel_emit_bit(melp, 1);
382
383 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
384 *(vlcp->buf - vlcp->pos) = 0x7f;
385 vlcp->pos++;
386 vlcp->tmp >>= 7;
387 vlcp->used_bits -= 7;
388 }
389
390 melp->tmp = melp->tmp << melp->remaining_bits;
391 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
392 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
393 if ((mel_mask | vlc_mask) == 0)
394 return; //last mel byte cannot be 0xFF, since then
395 //melp->remaining_bits would be < 8
396 if (melp->pos >= melp->buf_size)
397 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
398 ui8 vlcp_tmp = (ui8)vlcp->tmp;
399 int fuse = melp->tmp | vlcp_tmp;
400 if ( ( ((fuse ^ melp->tmp) & mel_mask)
401 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
402 && (fuse != 0xFF) && vlcp->pos > 1)
403 {
404 melp->buf[melp->pos++] = (ui8)fuse;
405 }
406 else
407 {
408 if (vlcp->pos >= vlcp->buf_size)
409 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
410 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
411 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
412 vlcp->pos++;
413 }
414 }
415
417//
419 struct ms_struct {
420 //storage
421 ui8* buf; //pointer to data buffer
422 ui32 pos; //position of next writing within buf
423 ui32 buf_size; //size of buffer, which we must not exceed
424
425 int max_bits; //maximum number of bits that can be store in tmp
426 int used_bits; //number of occupied bits in tmp
427 ui32 tmp; //temporary storage of coded bits
428 };
429
431 static inline void
432 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
433 {
434 msp->buf = data;
435 msp->pos = 0;
436 msp->buf_size = buffer_size;
437 msp->max_bits = 8;
438 msp->used_bits = 0;
439 msp->tmp = 0;
440 }
441
443 static inline void
444 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
445 {
446 while (cwd_len > 0)
447 {
448 if (msp->pos >= msp->buf_size)
449 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
450 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
451 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
452 msp->used_bits += t;
453 cwd >>= t;
454 cwd_len -= t;
455 if (msp->used_bits >= msp->max_bits)
456 {
457 msp->buf[msp->pos++] = (ui8)msp->tmp;
458 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
459 msp->tmp = 0;
460 msp->used_bits = 0;
461 }
462 }
463 }
464
466 static inline void
468 {
469 if (msp->used_bits)
470 {
471 int t = msp->max_bits - msp->used_bits; //unused bits
472 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
473 msp->used_bits += t;
474 if (msp->tmp != 0xFF)
475 {
476 if (msp->pos >= msp->buf_size)
477 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
478 msp->buf[msp->pos++] = (ui8)msp->tmp;
479 }
480 }
481 else if (msp->max_bits == 7)
482 msp->pos--;
483 }
484
485#define ZERO _mm256_setzero_si256()
486#define ONE _mm256_set1_epi32(1)
487
488// https://stackoverflow.com/a/58827596
// https://stackoverflow.com/a/58827596
/**
 * Per-lane count of leading zeros for eight 32-bit integers, computed from
 * the exponent of the int-to-float conversion.  The result is clamped at 32.
 */
inline __m256i avx2_lzcnt_epi32(__m256i v) {
  // clear the low bits (keep 8 MSB) so that the conversion to float cannot
  // round the value up to the next power of two
  const __m256i trimmed = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);

  // convert to float and bring the exponent field down to bit 0
  const __m256i float_bits = _mm256_castps_si256(_mm256_cvtepi32_ps(trimmed));
  const __m256i exponent = _mm256_srli_epi32(float_bits, 23);

  // undo the bias (saturating), then clamp at 32
  const __m256i count = _mm256_subs_epu16(_mm256_set1_epi32(158), exponent);
  return _mm256_min_epi16(count, _mm256_set1_epi32(32));
}
500
/**
 * Per-lane "not equal" comparison of 32-bit integers: a lane is all ones
 * where v and v2 differ and zero where they are equal.
 */
inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
  const __m256i equal = _mm256_cmpeq_epi32(v, v2);
  const __m256i all_ones = _mm256_set1_epi32(-1);
  return _mm256_xor_si256(equal, all_ones); // invert the equality mask
}
504
505static void proc_pixel(__m256i *src_vec, ui32 p,
506 __m256i *eq_vec, __m256i *s_vec,
507 __m256i &rho_vec, __m256i &e_qmax_vec)
508{
509 __m256i val_vec[4];
510 __m256i _eq_vec[4];
511 __m256i _s_vec[4];
512 __m256i _rho_vec[4];
513
514 for (ui32 i = 0; i < 4; ++i) {
515 /* val = t + t; //multiply by 2 and get rid of sign */
516 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
517
518 /* val >>= p; // 2 \mu_p + x */
519 val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
520
521 /* val &= ~1u; // 2 \mu_p */
522 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
523
524 /* if (val) { */
525 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
526
527 /* rho[i] = 1 << i;
528 * rho is processed below.
529 */
530
531 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
532 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
533 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
534 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
535
536 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
537 * e_qmax is processed below
538 */
539
540 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
541 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
542 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
543 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
544
545 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
546 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
547 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
548 /* } */
549 }
550
551 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
552
553 /* Reorder from
554 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
555 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7]
556 * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
557 * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
558 * to
559 * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
560 * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
561 * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
562 * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
563 */
564 __m256i tmp1, tmp2;
565 for (ui32 i = 0; i < 2; ++i) {
566 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
567 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
568 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
569 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
570
571 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
572 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
573 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
574 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
575
576 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
577 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
578 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
579 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
580 }
581
582 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
583 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
584 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
585 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
586 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
587 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
588 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
589 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
590 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
591}
592
593/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
594 * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
595 * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
596 * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
597 *
598 * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
599 * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
600 *
601 * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
602 * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
603 *
604 * [..]
605 */
/**
 * Transposes, in place, the 4 x 8 matrix of 32-bit values held in
 * matrix[0..3]: afterwards matrix[k] holds columns 2k and 2k+1 of the
 * input, each column stored as four consecutive values (rows 0..3).
 */
static void rotate_matrix(__m256i *matrix)
{
  // interleave 32-bit values of rows (0,1) and rows (2,3)
  const __m256i lo01 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
  const __m256i lo23 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
  const __m256i hi01 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
  const __m256i hi23 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);

  // interleave 64-bit pairs; each 128-bit half now holds one full column
  const __m256i cols_a = _mm256_unpacklo_epi64(lo01, lo23);
  const __m256i cols_b = _mm256_unpacklo_epi64(hi01, hi23);
  const __m256i cols_c = _mm256_unpackhi_epi64(lo01, lo23);
  const __m256i cols_d = _mm256_unpackhi_epi64(hi01, hi23);

  // gather low halves into vectors 0/1 and high halves into vectors 2/3
  matrix[0] = _mm256_permute2x128_si256(cols_a, cols_c, 0x20);
  matrix[2] = _mm256_permute2x128_si256(cols_a, cols_c, 0x31);
  matrix[1] = _mm256_permute2x128_si256(cols_b, cols_d, 0x20);
  matrix[3] = _mm256_permute2x128_si256(cols_b, cols_d, 0x31);
}
626
627static void proc_ms_encode(ms_struct *msp,
628 __m256i &tuple_vec,
629 __m256i &uq_vec,
630 __m256i &rho_vec,
631 __m256i *s_vec)
632{
633 __m256i m_vec[4];
634
635 /* Prepare parameters for ms_encode */
636 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
637 auto tmp = _mm256_and_si256(tuple_vec, ONE);
638 tmp = _mm256_sub_epi32(uq_vec, tmp);
639 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
640 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
641 m_vec[0] = _mm256_and_si256(mask, tmp);
642
643 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
644 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
645 tmp = _mm256_srli_epi32(tmp, 1);
646 tmp = _mm256_sub_epi32(uq_vec, tmp);
647 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
648 mask = avx2_cmpneq_epi32(tmp1, ZERO);
649 m_vec[1] = _mm256_and_si256(mask, tmp);
650
651 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
652 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
653 tmp = _mm256_srli_epi32(tmp, 2);
654 tmp = _mm256_sub_epi32(uq_vec, tmp);
655 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
656 mask = avx2_cmpneq_epi32(tmp1, ZERO);
657 m_vec[2] = _mm256_and_si256(mask, tmp);
658
659 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
660 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
661 tmp = _mm256_srli_epi32(tmp, 3);
662 tmp = _mm256_sub_epi32(uq_vec, tmp);
663 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
664 mask = avx2_cmpneq_epi32(tmp1, ZERO);
665 m_vec[3] = _mm256_and_si256(mask, tmp);
666
667 rotate_matrix(m_vec);
668 /* s_vec from
669 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
670 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
671 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
672 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
673 * to
674 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
675 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
676 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
677 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
678 */
679 rotate_matrix(s_vec);
680
681 ui32 cwd[8];
682 int cwd_len[8];
683 ui64 _cwd = 0;
684 int _cwd_len = 0;
685
686 /* Each iteration process 8 bytes * 2 lines */
687 for (ui32 i = 0; i < 4; ++i) {
688 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
689 * cwd_len = m
690 */
691 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
692 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
693 tmp = _mm256_sub_epi32(tmp, ONE);
694 tmp = _mm256_and_si256(tmp, s_vec[i]);
695 _mm256_storeu_si256((__m256i*)cwd, tmp);
696
697 for (ui32 j = 0; j < 4; ++j) {
698 ui32 idx = j * 2;
699 _cwd = cwd[idx];
700 _cwd_len = cwd_len[idx];
701 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
702 _cwd_len += cwd_len[idx + 1];
703 ms_encode(msp, _cwd, _cwd_len);
704 }
705 }
706}
707
708static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
709 __m256i &e_qmax_vec)
710{
711 /* if (u_q[i] > 0) {
712 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
713 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
714 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
715 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
716 * }
717 */
718 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
719
720 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
721 auto eps_vec = _mm256_srli_epi32(mask, 31);
722
723 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
724 auto tmp = _mm256_srli_epi32(mask, 31);
725 tmp = _mm256_slli_epi32(tmp, 1);
726 eps_vec = _mm256_or_si256(eps_vec, tmp);
727
728 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
729 tmp = _mm256_srli_epi32(mask, 31);
730 tmp = _mm256_slli_epi32(tmp, 2);
731 eps_vec = _mm256_or_si256(eps_vec, tmp);
732
733 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
734 tmp = _mm256_srli_epi32(mask, 31);
735 tmp = _mm256_slli_epi32(tmp, 3);
736 eps_vec = _mm256_or_si256(eps_vec, tmp);
737
738 return _mm256_and_si256(u_q_mask, eps_vec);
739}
740
741static void update_lep(ui32 x, __m256i &prev_e_val_vec,
742 __m256i *eq_vec, __m256i *e_val_vec,
743 const __m256i left_shift)
744{
745 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
746 * lep[0] = (ui8)e_q[3];
747 * Compare e_q[1] with e_q[3] of the prevous round.
748 */
749 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
750 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
751 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
752 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
753}
754
755
756static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
757 __m256i &rho_vec, __m256i *cx_val_vec,
758 const __m256i left_shift)
759{
760 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
761 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
762 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
763 */
764 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
765 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
766 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
767
768 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
769 tmp = _mm256_srli_epi32(tmp, 3);
770
771 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
772 tmp1 = _mm256_srli_epi32(tmp1, 1);
773 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
774}
775
776static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
777 __m256i &eps_vec, ui32 *vlc_tbl)
778{
779 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
780 auto tmp = _mm256_slli_epi32(cq_vec, 8);
781 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
782 tmp = _mm256_add_epi32(tmp, tmp1);
783 tmp = _mm256_add_epi32(tmp, eps_vec);
784 return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
785}
786
787static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
788 const __m256i right_shift)
789{
790 ojph_unused(x);
791 ojph_unused(cx_val_vec);
792 ojph_unused(right_shift);
793
794 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
795 auto tmp = _mm256_srli_epi32(rho_vec, 1);
796 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
797 return _mm256_or_si256(tmp, tmp1);
798}
799
800static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
801 const __m256i right_shift)
802{
803 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
804 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
805 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
806 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
807
808#ifdef OJPH_ARCH_X86_64
809 tmp = _mm256_insert_epi64(tmp,
810 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
811#elif (defined OJPH_ARCH_I386)
812 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
813 tmp = _mm256_insert_epi32(tmp, lsb, 6);
814 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
815 tmp = _mm256_insert_epi32(tmp, msb, 7);
816#else
817 #error Error unsupport compiler
818#endif
819 tmp = _mm256_slli_epi32(tmp, 2);
820 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
821 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
822 tmp = _mm256_add_epi32(tmp1, tmp);
823
824 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
825 tmp1 = _mm256_srli_epi32(tmp1, 1);
826 tmp = _mm256_or_si256(tmp, tmp1);
827
828 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
829 tmp1 = _mm256_srli_epi32(tmp1, 2);
830
831 return _mm256_or_si256(tmp, tmp1);
832}
833
834using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i);
835
836static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
837 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
838 const __m256i right_shift)
839{
840 int32_t mel_need_encode[8];
841 int32_t mel_need_encode2[8];
842 int32_t mel_bit[8];
843 int32_t mel_bit2[8];
844 /* Prepare mel_encode params */
845 /* if (c_q[i] == 0) { */
846 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
847 /* mel_encode(&mel, rho[i] != 0); */
848 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
849 /* } */
850
851 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
852 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
853 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
854 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
855
856 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
857 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
858 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
859
860 ui32 i_max = 8 - (ignore / 2);
861
862 for (ui32 i = 0; i < i_max; i += 2) {
863 if (mel_need_encode[i]) {
864 mel_encode(melp, mel_bit[i]);
865 }
866
867 if (i + 1 < i_max) {
868 if (mel_need_encode[i + 1]) {
869 mel_encode(melp, mel_bit[i + 1]);
870 }
871 }
872
873 if (mel_need_encode2[i]) {
874 mel_encode(melp, mel_bit2[i]);
875 }
876 }
877}
878
879static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
880 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
881 const __m256i right_shift)
882{
883 ojph_unused(u_q_vec);
884 ojph_unused(right_shift);
885 int32_t mel_need_encode[8];
886 int32_t mel_bit[8];
887
888 /* Prepare mel_encode params */
889 /* if (c_q[i] == 0) { */
890 _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
891 /* mel_encode(&mel, rho[i] != 0); */
892 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
893 /* } */
894
895 ui32 i_max = 8 - (ignore / 2);
896
897 for (ui32 i = 0; i < i_max; ++i) {
898 if (mel_need_encode[i]) {
899 mel_encode(melp, mel_bit[i]);
900 }
901 }
902}
903
904using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
905 __m256i, ui32, const __m256i);
906
907static void proc_vlc_encode1(vlc_struct_avx2 *vlcp, ui32 *tuple,
908 ui32 *u_q, ui32 ignore)
909{
910 ui32 i_max = 8 - (ignore / 2);
911
912 for (ui32 i = 0; i < i_max; i += 2) {
913 /* 7 bits */
914 ui32 val = tuple[i + 0] >> 4;
915 int size = tuple[i + 0] & 7;
916
917 if (i + 1 < i_max) {
918 /* 7 bits */
919 val |= (tuple[i + 1] >> 4) << size;
920 size += tuple[i + 1] & 7;
921 }
922
923 if (u_q[i] > 2 && u_q[i + 1] > 2) {
924 /* 3 bits */
925 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
926 size += ulvc_cwd_pre_len[u_q[i] - 2];
927
928 /* 3 bits */
929 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
930 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
931
932 /* 5 bits */
933 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
934 size += ulvc_cwd_suf_len[u_q[i] - 2];
935
936 /* 5 bits */
937 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
938 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
939
940 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
941 /* 3 bits */
942 val |= (ulvc_cwd_pre[u_q[i]]) << size;
943 size += ulvc_cwd_pre_len[u_q[i]];
944
945 /* 1 bit */
946 val |= (u_q[i + 1] - 1) << size;
947 size += 1;
948
949 /* 5 bits */
950 val |= (ulvc_cwd_suf[u_q[i]]) << size;
951 size += ulvc_cwd_suf_len[u_q[i]];
952
953 } else {
954 /* 3 bits */
955 val |= (ulvc_cwd_pre[u_q[i]]) << size;
956 size += ulvc_cwd_pre_len[u_q[i]];
957
958 /* 3 bits */
959 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
960 size += ulvc_cwd_pre_len[u_q[i + 1]];
961
962 /* 5 bits */
963 val |= (ulvc_cwd_suf[u_q[i]]) << size;
964 size += ulvc_cwd_suf_len[u_q[i]];
965
966 /* 5 bits */
967 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
968 size += ulvc_cwd_suf_len[u_q[i + 1]];
969 }
970
971 vlc_encode(vlcp, val, size);
972 }
973}
974
975static void proc_vlc_encode2(vlc_struct_avx2 *vlcp, ui32 *tuple,
976 ui32 *u_q, ui32 ignore)
977{
978 ui32 i_max = 8 - (ignore / 2);
979
980 for (ui32 i = 0; i < i_max; i += 2) {
981 /* 7 bits */
982 ui32 val = tuple[i + 0] >> 4;
983 int size = tuple[i + 0] & 7;
984
985 if (i + 1 < i_max) {
986 /* 7 bits */
987 val |= (tuple[i + 1] >> 4) << size;
988 size += tuple[i + 1] & 7;
989 }
990
991 /* 3 bits */
992 val |= ulvc_cwd_pre[u_q[i]] << size;
993 size += ulvc_cwd_pre_len[u_q[i]];
994
995 /* 3 bits */
996 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
997 size += ulvc_cwd_pre_len[u_q[i + 1]];
998
999 /* 5 bits */
1000 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
1001 size += ulvc_cwd_suf_len[u_q[i + 0]];
1002
1003 /* 5 bits */
1004 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
1005 size += ulvc_cwd_suf_len[u_q[i + 1]];
1006
1007 vlc_encode(vlcp, val, size);
1008 }
1009}
1010
1011using fn_proc_vlc_encode = void (*)(vlc_struct_avx2 *, ui32 *, ui32 *, ui32);
1012
/* Encodes one codeblock with AVX2 and stores the resulting bytes in `coded`.
 *
 * Samples are read from `buf` two rows at a time, in groups of 16 columns.
 * Every group goes through proc_pixel() and the MEL / MagSgn / VLC encoding
 * helpers; once all rows are done the three byte streams (ms, mel, vlc) are
 * terminated and copied back to back into a buffer requested from `elastic`.
 *
 * buf          - codeblock samples; only the first `_width` entries of a
 *                row are read.
 * missing_msbs - only used to derive p = 30 - missing_msbs, which is passed
 *                to proc_pixel().
 * num_passes   - currently unused.
 * _width       - number of columns in the codeblock.
 * height       - number of rows; when it is odd, the missing second row of
 *                the last pair is replaced by zeros.
 * stride       - distance between consecutive rows of `buf`, in ui32 entries.
 * lengths      - lengths[0] receives the total number of bytes produced.
 * elastic      - allocator asked for the output buffer via get_buffer().
 * coded        - passed by reference to elastic->get_buffer(); its buf gets
 *                the output bytes and its avail_size is reduced by
 *                lengths[0].
 *
 * NOTE(review): e_val_vec and cx_val_vec have 65 entries and are indexed up
 * to n_loop, so this appears to rely on _width <= 1024; nothing in this
 * function checks that -- confirm callers guarantee it.
 */
1013void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
1014 ui32 num_passes, ui32 _width, ui32 height,
1015 ui32 stride, ui32* lengths,
1016 ojph::mem_elastic_allocator *elastic,
1017 ojph::coded_lists *& coded)
1018{
1019 ojph_unused(num_passes); //currently not used
1020
// width is _width rounded up to a multiple of 16; ignore is the number of
// padding columns that rounding adds to the last group of 16.
1021 ui32 width = (_width + 15) & ~15u;
1022 ui32 ignore = width - _width;
1023 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1024 const int mel_vlc_size = 3072; //more than enough
1025 const int mel_size = 192;
1026 const int vlc_size = mel_vlc_size - mel_size;
1027
// Local scratch buffers for the three streams.  mel_vlc_buf is shared: its
// first mel_size bytes are handed to the MEL stream and the remaining
// vlc_size bytes to the VLC stream.
1028 ui8 ms_buf[ms_size];
1029 ui8 mel_vlc_buf[mel_vlc_size];
1030 ui8 *mel_buf = mel_vlc_buf;
1031 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1032
1033 mel_struct mel;
1034 mel_init(&mel, mel_size, mel_buf);
1035 vlc_struct_avx2 vlc;
1036 vlc_init(&vlc, vlc_size, vlc_buf);
1037 ms_struct ms;
1038 ms_init(&ms, ms_size, ms_buf);
1039
1040 const ui32 p = 30 - missing_msbs;
1041
1042 //e_val: E values for a line (these are the highest set bit)
1043 //cx_val: is the context values
1044 //Each byte stores the info for the 2 sample. For E, it is maximum
1045 // of the two samples, while for cx, it is the OR of these two samples.
1046 //The maximum is between the pixel at the bottom left of one quad
1047 // and the bottom right of the earlier quad. The same is true for cx.
1048 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1049 // one for the non-existing earlier quad, and one for beyond the
1050 // end
// Lane index vectors for _mm256_permutevar8x32_epi32.
// right_shift: lane i receives lane i + 1 (lane 7 receives lane 0).
1051 const __m256i right_shift = _mm256_set_epi32(
1052 0, 7, 6, 5, 4, 3, 2, 1
1053 );
1054
// left_shift: lane i + 1 receives lane i (lane 0 receives lane 7).
1055 const __m256i left_shift = _mm256_set_epi32(
1056 6, 5, 4, 3, 2, 1, 0, 7
1057 );
1058
// Number of 16-column groups per row.
1059 ui32 n_loop = (width + 15) / 16;
1060
// Only the first min(64, n_loop) entries are cleared here; entry n_loop is
// written at the top of every row-pair iteration below.
1061 __m256i e_val_vec[65];
1062 for (ui32 i = 0; i <ojph_min(64, n_loop); ++i) {
1063 e_val_vec[i] = ZERO;
1064 }
1065 __m256i prev_e_val_vec = ZERO;
1066
// NOTE(review): cx_val_vec is not initialised here; presumably proc_cq1,
// which is used for the first row pair, does not read it -- confirm.
1067 __m256i cx_val_vec[65];
1068 __m256i prev_cx_val_vec = ZERO;
1069
1070 ui32 prev_cq = 0;
1071
1072 __m256i eq_vec[4];
1073 __m256i s_vec[4];
1074 __m256i src_vec[4];
1075
// Table and helpers used for the first pair of rows; they are replaced by
// vlc_tbl1 and the *2 helpers at the end of the first row-pair iteration.
1076 ui32 *vlc_tbl = vlc_tbl0;
1077 fn_proc_cq proc_cq = proc_cq1;
1078 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1079 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1080
1081 /* 2 lines per iteration */
1082 for (ui32 y = 0; y < height; y += 2)
1083 {
// Entry n_loop (one past the last group) is seeded from the values carried
// in prev_e_val_vec / prev_cx_val_vec, which are then reset for this pair.
1084 e_val_vec[n_loop] = prev_e_val_vec;
1085 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1086 __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1087 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1088
1089 prev_e_val_vec = ZERO;
1090 prev_cx_val_vec = ZERO;
1091
1092 ui32 *sp = buf + y * stride;
1093
1094 /* 16 columns (ui32 samples) per row per iteration */
1095 for (ui32 x = 0; x < n_loop; ++x) {
1096
1097 /* t = sp[i]; */
// Last group of a row whose width is not a multiple of 16: copy the
// remaining (_width % 16) samples through a zeroed scratch buffer so the
// padding columns are zero and nothing beyond the row's _width samples is
// loaded.
1098 if ((x == (n_loop - 1)) && (_width % 16)) {
1099 ui32 tmp_buf[16] = { 0 };
1100 memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
1101 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1102 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
// The second row only exists when y + 1 < height.  The tmp_buf entries past
// (_width % 16) are still zero from the initialiser above.
1103 if (y + 1 < height) {
1104 memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
1105 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1106 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1107 }
1108 else {
1109 src_vec[1] = ZERO;
1110 src_vec[3] = ZERO;
1111 }
1112 }
1113 else {
// Full group: load 16 samples of each row directly from buf.
1114 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1115 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1116
1117 if (y + 1 < height) {
1118 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1119 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1120 }
1121 else {
1122 src_vec[1] = ZERO;
1123 src_vec[3] = ZERO;
1124 }
1125 sp += 16;
1126 }
1127
1128 /* src_vec layout:
1129 * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7]
1130 * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7]
1131 * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15]
1132 * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15]
1133 */
1134 __m256i rho_vec, e_qmax_vec;
1135 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1136
1137 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
// tmp = e_val_vec[x] with lanes moved down by one; lane 7 is then replaced
// by lane 0 of the next group so each lane sees its right-hand neighbour.
1138 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1139 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1140
1141 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1142 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1143
1144 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1145 tmp = _mm256_max_epi32(max_e_vec, ONE);
1146 __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1147 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1148
// cmp is all-ones in lanes where rho & (rho - 1) is zero.  Those lanes get
// kappa = 1, the other lanes get max(1, max_e); the final max merges the
// two masked halves.
1149 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1150 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1151 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1152 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1153
1154 /* cq[1 - 16] = cq_vec
1155 * cq[0] = prev_cq_vec[0]
1156 */
1157 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1158
// Move the lanes of tmp up by one, put the value carried from the previous
// group (prev_cq) into lane 0, and remember lane 7 of tmp for the next
// group.
1159 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1160 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1161 prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
1162
1163 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1164 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1165
1166 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1167 /* u_q[i] = Uq[i] - kappa[i]; */
1168 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1169 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1170
1171 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1172 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Padding columns exist only in the last group of the row.
1173 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1174
1175 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1176 right_shift);
1177
1178 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1179
1180 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1181 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
1182 ui32 u_q[8];
1183 ui32 tuple[8];
1184 /* The tuple is scaled by 4 due to:
1185 * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
1186 * So in the vlc_encode, the tuple will only be scaled by 2.
1187 */
1188 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
// Spill both vectors to plain arrays; the VLC helper works on scalars.
1189 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1190 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1191
1192 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1193 }
1194
// Starting prev_cq for the next row pair:
// (lane 1 of cx_val_vec[0] << 2) + lane 0 of cx_val_vec[0].
1195 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1196 tmp = _mm256_slli_epi32(tmp, 2);
1197 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1198 prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1199
// Every row pair after the first uses the second set of helpers and table.
1200 proc_cq = proc_cq2;
1201 vlc_tbl = vlc_tbl1;
1202 proc_mel_encode = proc_mel_encode2;
1203 proc_vlc_encode = proc_vlc_encode2;
1204 }
1205
1206 ms_terminate(&ms);
1207 terminate_mel_vlc(&mel, &vlc);
1208
// Output layout: ms bytes, then mel bytes, then vlc bytes.
// NOTE(review): the vlc bytes are copied from vlc.buf - vlc.pos + 1, which
// suggests the VLC stream is written backwards from vlc.buf -- confirm
// against vlc_init / vlc_encode.
1209 //copy to elastic
1210 lengths[0] = mel.pos + vlc.pos + ms.pos;
1211 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1212 memcpy(coded->buf, ms.buf, ms.pos);
1213 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1214 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1215
// num_bytes (mel + vlc length) is split over the last two output bytes:
// bits 4 and up go into the last byte, the low 4 bits replace the low
// nibble of the second-to-last byte while its high nibble is kept.
1216 // put in the interface locator word
1217 ui32 num_bytes = mel.pos + vlc.pos;
1218 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1219 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1220 coded->buf[lengths[0]-2] =
1221 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1222
1223 coded->avail_size -= lengths[0];
1224}
1225
1226} /* namespace local */
1227} /* namespace ojph */
1228
1229#endif
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition ojph_mem.cpp:113
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static bool tables_initialized
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition ojph_arch.h:152
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_min(a, b)
Definition ojph_defs.h:76
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_ERROR(t,...)