#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
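
// AVX2 offers no arithmetic right shift for 64-bit lanes, so the helper
// below emulates it.  For a two's-complement value v and shift amount n,
//
//   (v >> n)  ==  ((v >>> n) ^ m) - m,   with  m = 1 << (63 - n),
//
// where >>> is the logical shift: XORing with m and then subtracting m
// copies the (shifted) sign bit into the upper n bits.  The callers
// precompute m once per loop and pass it in as the third argument.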
static inline
__m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
{
  // shift logically, then use the mask m to restore the sign bits
  __m256i x = _mm256_srli_epi64(a, amt);
  x = _mm256_xor_si256(x, m);
  __m256i result = _mm256_sub_epi64(x, m);
  return result;
}
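
// avx2_rev_convert() adds a constant shift to every sample of a line while
// converting between the 32-bit and 64-bit line_buf representations.  The
// widening path sign-extends each 128-bit half with _mm256_cvtepi32_epi64;
// the narrowing path keeps the low 32 bits of each 64-bit result and repacks
// two source registers into one destination register.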
void avx2_rev_convert(const line_buf *src_line,
                      const ui32 src_line_offset,
                      line_buf *dst_line,
                      const ui32 dst_line_offset,
                      si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source to 32-bit destination: add the shift directly
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      __m256i sh = _mm256_set1_epi32((si32)shift);
      for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
      {
        __m256i s = _mm256_loadu_si256((__m256i*)sp);
        s = _mm256_add_epi32(s, sh);
        _mm256_storeu_si256((__m256i*)dp, s);
      }
    }
    else
    {
      // 32-bit source to 64-bit destination: sign-extend each half, then add
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      __m256i sh = _mm256_set1_epi64x(shift);
      for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
      {
        __m256i s, t;
        s = _mm256_loadu_si256((__m256i*)sp);

        t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0));
        t = _mm256_add_epi64(t, sh);
        _mm256_storeu_si256((__m256i*)dp, t);

        t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1));
        t = _mm256_add_epi64(t, sh);
        _mm256_storeu_si256((__m256i*)dp + 1, t);
      }
    }
  }
  else
  {
    // 64-bit source to 32-bit destination: add the shift, keep the low
    // 32 bits of each sample, and pack two source registers into one
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                         0, (si64)ULLONG_MAX);
    __m256i sh = _mm256_set1_epi64x(shift);
    for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
    {
      __m256i s, t;
      s = _mm256_loadu_si256((__m256i*)sp);
      s = _mm256_add_epi64(s, sh);
      t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
      t = _mm256_and_si256(low_bits, t);

      s = _mm256_loadu_si256((__m256i*)sp + 1);
      s = _mm256_add_epi64(s, sh);
      s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
      s = _mm256_andnot_si256(low_bits, s);

      t = _mm256_or_si256(s, t);
      t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
      _mm256_storeu_si256((__m256i*)dp, t);
    }
  }
}
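
// avx2_rev_convert_nlt_type3() performs the same width conversions, but
// routes negative samples through the non-linear type 3 mapping.  Per
// sample, the loops below compute the equivalent of
//
//   dst = (src < 0) ? (-shift - src) : src;
//
// using a compare mask with and/andnot/or selects instead of a branch.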
void avx2_rev_convert_nlt_type3(const line_buf *src_line,
                                const ui32 src_line_offset,
                                line_buf *dst_line,
                                const ui32 dst_line_offset,
                                si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source to 32-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      __m256i sh = _mm256_set1_epi32((si32)(-shift));
      __m256i zero = _mm256_setzero_si256();
      for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
      {
        __m256i s = _mm256_loadu_si256((__m256i*)sp);
        __m256i c = _mm256_cmpgt_epi32(zero, s);   // lanes where s < 0
        __m256i v_m_sh = _mm256_sub_epi32(sh, s);  // -shift - s
        v_m_sh = _mm256_and_si256(c, v_m_sh);      // keep for negative lanes
        s = _mm256_andnot_si256(c, s);             // keep s for the rest
        s = _mm256_or_si256(s, v_m_sh);
        _mm256_storeu_si256((__m256i*)dp, s);
      }
    }
    else
    {
      // 32-bit source to 64-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      __m256i sh = _mm256_set1_epi64x(-shift);
      __m256i zero = _mm256_setzero_si256();
      for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
      {
        __m256i s, t, u0, u1, c, v_m_sh;
        s = _mm256_loadu_si256((__m256i*)sp);
        t = _mm256_cmpgt_epi32(zero, s);           // 32-bit sign masks

        // lower four samples, widened by interleaving with the sign mask
        u0 = _mm256_unpacklo_epi32(s, t);
        c = _mm256_unpacklo_epi32(t, t);           // 64-bit sign masks
        v_m_sh = _mm256_sub_epi64(sh, u0);         // -shift - s
        v_m_sh = _mm256_and_si256(c, v_m_sh);
        u0 = _mm256_andnot_si256(c, u0);
        u0 = _mm256_or_si256(u0, v_m_sh);

        // upper four samples
        u1 = _mm256_unpackhi_epi32(s, t);
        c = _mm256_unpackhi_epi32(t, t);
        v_m_sh = _mm256_sub_epi64(sh, u1);
        v_m_sh = _mm256_and_si256(c, v_m_sh);
        u1 = _mm256_andnot_si256(c, u1);
        u1 = _mm256_or_si256(u1, v_m_sh);

        // unpack works per 128-bit lane; permute2x128 restores sample order
        t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0);
        _mm256_storeu_si256((__m256i*)dp, t);
        t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1);
        _mm256_storeu_si256((__m256i*)dp + 1, t);
      }
    }
  }
  else
  {
    // 64-bit source to 32-bit destination
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    __m256i sh = _mm256_set1_epi64x(-shift);
    __m256i zero = _mm256_setzero_si256();
    __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                          0, (si64)ULLONG_MAX);
    for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
    {
      // p keeps the non-negative samples, n holds -shift - s for the
      // negative ones; the two are merged and narrowed to 32 bits
      __m256i s, t, p, n, m, tm;
      s = _mm256_loadu_si256((__m256i*)sp);
      m = _mm256_cmpgt_epi64(zero, s);
      tm = _mm256_sub_epi64(sh, s);
      n = _mm256_and_si256(m, tm);
      p = _mm256_andnot_si256(m, s);
      tm = _mm256_or_si256(n, p);
      tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
      t = _mm256_and_si256(half_mask, tm);

      s = _mm256_loadu_si256((__m256i*)sp + 1);
      m = _mm256_cmpgt_epi64(zero, s);
      tm = _mm256_sub_epi64(sh, s);
      n = _mm256_and_si256(m, tm);
      p = _mm256_andnot_si256(m, s);
      tm = _mm256_or_si256(n, p);
      tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
      tm = _mm256_andnot_si256(half_mask, tm);

      t = _mm256_or_si256(t, tm);
      t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
      _mm256_storeu_si256((__m256i*)dp, t);
    }
  }
}
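
// The two helpers below blend integer vectors based on a floating-point
// comparison: ojph_mm256_max_ge_epi32() returns a where x >= y and b
// elsewhere; ojph_mm256_min_lt_epi32() returns a where x < y and b
// elsewhere.  Both use unordered predicates (_CMP_NLT_UQ, _CMP_NGE_UQ),
// which evaluate to true for NaN operands, so NaN lanes select a.  They are
// used below to saturate converted samples against the representable range
// while testing the original float values.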
static inline
__m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
{
  __m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ);  // all ones where x >= y
  __m256i c = _mm256_castps_si256(ct);
  __m256i d = _mm256_and_si256(c, a);            // keep a where x >= y
  __m256i e = _mm256_andnot_si256(c, b);         // keep b elsewhere
  return _mm256_or_si256(d, e);
}

static inline
__m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
{
  __m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ);  // all ones where x < y
  __m256i c = _mm256_castps_si256(ct);
  __m256i d = _mm256_and_si256(c, a);            // keep a where x < y
  __m256i e = _mm256_andnot_si256(c, b);         // keep b elsewhere
  return _mm256_or_si256(d, e);
}
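
// local_avx2_irv_convert_to_integer() converts float samples (nominally in
// [-0.5, 0.5)) to bit_depth-bit integers.  Per sample, the loops below
// compute the equivalent of
//
//   v = round(f * 2^bit_depth);     // _mm256_cvtps_epi32, current rounding
//   v = clamp(v, INT_MIN >> (32 - bit_depth), INT_MAX >> (32 - bit_depth));
//   if (is_signed)
//     v = (NLT_TYPE3 && v < 0) ? -(2^(bit_depth-1) + 1) - v : v;
//   else
//     v += 2^(bit_depth-1);
//
// The clamping is decided on the pre-rounded float via the helpers above,
// which keeps out-of-range values from wrapping around after conversion.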
template<bool NLT_TYPE3>
static inline
void local_avx2_irv_convert_to_integer(const line_buf *src_line,
                                       line_buf *dst_line,
                                       ui32 dst_line_offset,
                                       ui32 bit_depth, bool is_signed,
                                       ui32 width)
{
  assert(bit_depth <= 32);
  const float* sp = src_line->f32;
  si32* dp = dst_line->i32 + dst_line_offset;

  // saturation limits for the bit_depth-bit range, decided on the
  // pre-rounded float values
  si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
  __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth));
  __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit);
  __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit);
  __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
  __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));

  if (is_signed)
  {
    __m256i zero = _mm256_setzero_si256();
    __m256i bias =
      _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
      __m256 t = _mm256_loadu_ps(sp);
      t = _mm256_mul_ps(t, mul);
      __m256i u = _mm256_cvtps_epi32(t);
      u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
      u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
      if (NLT_TYPE3)
      {
        // map negative samples through u -> bias - u
        __m256i c = _mm256_cmpgt_epi32(zero, u);
        __m256i neg = _mm256_sub_epi32(bias, u);
        neg = _mm256_and_si256(c, neg);
        u = _mm256_andnot_si256(c, u);
        u = _mm256_or_si256(neg, u);
      }
      _mm256_storeu_si256((__m256i*)dp, u);
    }
  }
  else
  {
    __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
      __m256 t = _mm256_loadu_ps(sp);
      t = _mm256_mul_ps(t, mul);
      __m256i u = _mm256_cvtps_epi32(t);
      u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
      u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
      u = _mm256_add_epi32(u, half);   // unsigned samples get a half offset
      _mm256_storeu_si256((__m256i*)dp, u);
    }
  }
}
void avx2_irv_convert_to_integer(const line_buf *src_line,
                                 line_buf *dst_line, ui32 dst_line_offset,
                                 ui32 bit_depth, bool is_signed, ui32 width)
{
  local_avx2_irv_convert_to_integer<false>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}

void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
                                 line_buf *dst_line, ui32 dst_line_offset,
                                 ui32 bit_depth, bool is_signed, ui32 width)
{
  local_avx2_irv_convert_to_integer<true>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}
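
// local_avx2_irv_convert_to_float() is the inverse mapping: integer samples
// are scaled back to floats by 2^-bit_depth.  Per sample it computes the
// equivalent of
//
//   if (is_signed)
//     t = (NLT_TYPE3 && t < 0) ? -(2^(bit_depth-1) + 1) - t : t;
//   else
//     t -= 2^(bit_depth-1);
//   f = (float)t * 2^-bit_depth;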
template<bool NLT_TYPE3>
static inline
void local_avx2_irv_convert_to_float(const line_buf *src_line,
                                     ui32 src_line_offset,
                                     line_buf *dst_line,
                                     ui32 bit_depth, bool is_signed,
                                     ui32 width)
{
  assert(bit_depth <= 32);
  __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

  const si32* sp = src_line->i32 + src_line_offset;
  float* dp = dst_line->f32;

  if (is_signed)
  {
    __m256i zero = _mm256_setzero_si256();
    __m256i bias =
      _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
      __m256i t = _mm256_loadu_si256((__m256i*)sp);
      if (NLT_TYPE3)
      {
        // undo the NLT type 3 mapping: t -> bias - t for negative samples
        __m256i c = _mm256_cmpgt_epi32(zero, t);
        __m256i neg = _mm256_sub_epi32(bias, t);
        neg = _mm256_and_si256(c, neg);
        c = _mm256_andnot_si256(c, t);
        t = _mm256_or_si256(neg, c);
      }
      __m256 v = _mm256_cvtepi32_ps(t);
      v = _mm256_mul_ps(v, mul);
      _mm256_storeu_ps(dp, v);
    }
  }
  else
  {
    __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
      __m256i t = _mm256_loadu_si256((__m256i*)sp);
      t = _mm256_sub_epi32(t, half);   // remove the unsigned half offset
      __m256 v = _mm256_cvtepi32_ps(t);
      v = _mm256_mul_ps(v, mul);
      _mm256_storeu_ps(dp, v);
    }
  }
}
void avx2_irv_convert_to_float(const line_buf *src_line,
                               ui32 src_line_offset, line_buf *dst_line,
                               ui32 bit_depth, bool is_signed, ui32 width)
{
  local_avx2_irv_convert_to_float<false>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}

void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
                               ui32 src_line_offset, line_buf *dst_line,
                               ui32 bit_depth, bool is_signed, ui32 width)
{
  local_avx2_irv_convert_to_float<true>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}
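
// avx2_rct_forward() computes the forward reversible color transform (RCT)
// of JPEG 2000.  For each pixel,
//
//   Y  = (R + 2*G + B) >> 2,   Cb = B - G,   Cr = R - G.
//
// The first branch works entirely on 32-bit sample lines; the second reads
// 32-bit RGB samples, widens them to 64 bits, and writes 64-bit Y/Cb/Cr
// lines, using avx2_mm256_srai_epi64() for the 64-bit arithmetic shift.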
void avx2_rct_forward(const line_buf *r, const line_buf *g,
                      const line_buf *b,
                      line_buf *y, line_buf *cb, line_buf *cr,
                      ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    for (int i = (repeat + 7) >> 3; i > 0; --i)
    {
      __m256i mr = _mm256_load_si256((__m256i*)rp);
      __m256i mg = _mm256_load_si256((__m256i*)gp);
      __m256i mb = _mm256_load_si256((__m256i*)bp);
      __m256i t = _mm256_add_epi32(mr, mb);
      t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1));          // r + 2g + b
      _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2));  // y
      t = _mm256_sub_epi32(mb, mg);
      _mm256_store_si256((__m256i*)cbp, t);                       // cb = b - g
      t = _mm256_sub_epi32(mr, mg);
      _mm256_store_si256((__m256i*)crp, t);                       // cr = r - g

      rp += 8; gp += 8; bp += 8;
      yp += 8; cbp += 8; crp += 8;
    }
  }
  else
  {
    // 32-bit RGB inputs, 64-bit YCbCr outputs; widen and process the two
    // 128-bit halves of each load separately
    __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    for (int i = (repeat + 7) >> 3; i > 0; --i)
    {
      __m256i mr32 = _mm256_load_si256((__m256i*)rp);
      __m256i mg32 = _mm256_load_si256((__m256i*)gp);
      __m256i mb32 = _mm256_load_si256((__m256i*)bp);
      __m256i mr, mg, mb, t;
      mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0));
      mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0));
      mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0));

      t = _mm256_add_epi64(mr, mb);
      t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
      _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
      t = _mm256_sub_epi64(mb, mg);
      _mm256_store_si256((__m256i*)cbp, t);
      t = _mm256_sub_epi64(mr, mg);
      _mm256_store_si256((__m256i*)crp, t);

      yp += 4; cbp += 4; crp += 4;

      mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1));
      mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1));
      mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1));

      t = _mm256_add_epi64(mr, mb);
      t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
      _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
      t = _mm256_sub_epi64(mb, mg);
      _mm256_store_si256((__m256i*)cbp, t);
      t = _mm256_sub_epi64(mr, mg);
      _mm256_store_si256((__m256i*)crp, t);

      rp += 8; gp += 8; bp += 8;
      yp += 4; cbp += 4; crp += 4;
    }
  }
}
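
// avx2_rct_backward() inverts the transform above:
//
//   G = Y - ((Cb + Cr) >> 2),   B = Cb + G,   R = Cr + G.
//
// The second branch reads 64-bit Y/Cb/Cr samples, computes the inverse in
// 64 bits, and narrows the results to 32-bit RGB by keeping the low dword
// of each sample and re-interleaving the two halves with a 64-bit lane
// permute.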
void avx2_rct_backward(const line_buf *y, const line_buf *cb,
                       const line_buf *cr,
                       line_buf *r, line_buf *g, line_buf *b,
                       ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 7) >> 3; i > 0; --i)
    {
      __m256i my = _mm256_load_si256((__m256i*)yp);
      __m256i mcb = _mm256_load_si256((__m256i*)cbp);
      __m256i mcr = _mm256_load_si256((__m256i*)crp);

      __m256i t = _mm256_add_epi32(mcb, mcr);
      t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2));   // g
      _mm256_store_si256((__m256i*)gp, t);
      __m256i u = _mm256_add_epi32(mcb, t);                // b = cb + g
      _mm256_store_si256((__m256i*)bp, u);
      u = _mm256_add_epi32(mcr, t);                        // r = cr + g
      _mm256_store_si256((__m256i*)rp, u);

      yp += 8; cbp += 8; crp += 8;
      rp += 8; gp += 8; bp += 8;
    }
  }
  else
  {
    // 64-bit YCbCr inputs, 32-bit RGB outputs; two registers of si64
    // samples are processed per iteration and narrowed into one si32 register
    __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
    __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                         0, (si64)ULLONG_MAX);
    const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 7) >> 3; i > 0; --i)
    {
      __m256i my, mcb, mcr, tr, tg, tb;
      __m256i mr, mg, mb;
      my  = _mm256_load_si256((__m256i*)yp);
      mcb = _mm256_load_si256((__m256i*)cbp);
      mcr = _mm256_load_si256((__m256i*)crp);

      tg = _mm256_add_epi64(mcb, mcr);
      tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
      tb = _mm256_add_epi64(mcb, tg);
      tr = _mm256_add_epi64(mcr, tg);

      // narrow the first four samples into the low half of each lane
      mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
      mr = _mm256_and_si256(low_bits, mr);
      mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
      mg = _mm256_and_si256(low_bits, mg);
      mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
      mb = _mm256_and_si256(low_bits, mb);

      yp += 4; cbp += 4; crp += 4;

      my  = _mm256_load_si256((__m256i*)yp);
      mcb = _mm256_load_si256((__m256i*)cbp);
      mcr = _mm256_load_si256((__m256i*)crp);

      tg = _mm256_add_epi64(mcb, mcr);
      tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
      tb = _mm256_add_epi64(mcb, tg);
      tr = _mm256_add_epi64(mcr, tg);

      // place the next four samples in the high half of each lane, merge,
      // and restore sample order with a 64-bit lane permute
      tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
      tr = _mm256_andnot_si256(low_bits, tr);
      mr = _mm256_or_si256(mr, tr);
      mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0));

      tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
      tg = _mm256_andnot_si256(low_bits, tg);
      mg = _mm256_or_si256(mg, tg);
      mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0));

      tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
      tb = _mm256_andnot_si256(low_bits, tb);
      mb = _mm256_or_si256(mb, tb);
      mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0));

      _mm256_store_si256((__m256i*)rp, mr);
      _mm256_store_si256((__m256i*)gp, mg);
      _mm256_store_si256((__m256i*)bp, mb);

      yp += 4; cbp += 4; crp += 4;
      rp += 8; gp += 8; bp += 8;
    }
  }
}