#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
    // SSE2 has no 64-bit arithmetic right shift; emulate it with a logical
    // shift followed by sign restoration.  The caller passes m with a single
    // bit set where the sign bit lands after the shift.
    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
    {
      __m128i x = _mm_srli_epi64(a, amt);    // logical shift
      x = _mm_xor_si128(x, m);               // flip the shifted sign bit
      __m128i result = _mm_sub_epi64(x, m);  // borrow extends the sign
      return result;
    }
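    // A scalar sketch of the sign-restoration identity used above, included
    // only to illustrate what sse2_mm_srai_epi64 computes per 64-bit lane;
    // the function name is illustrative and not part of the library.
    // Assuming 0 <= amt < 64 and m = 1 << (63 - amt), a logical shift
    // followed by xor/sub with m reproduces the arithmetic shift.
    static inline si64 srai64_scalar_sketch(si64 a, int amt)
    {
      ui64 x = (ui64)a >> amt;       // logical shift
      ui64 m = 1ULL << (63 - amt);   // position of the shifted sign bit
      return (si64)((x ^ m) - m);    // flip it, then let the borrow extend it
    }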
    static void sse2_deinterleave32(float* dpl, float* dph, float* sp,
                                    int width)
    {
      // split an interleaved line into even and odd samples, 8 at a time
      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
      {
        __m128 a = _mm_load_ps(sp);
        __m128 b = _mm_load_ps(sp + 4);
        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
        _mm_store_ps(dpl, c);
        _mm_store_ps(dph, d);
      }
    }
    static void sse2_interleave32(float* dp, float* spl, float* sph,
                                  int width)
    {
      // merge two half-lines back into one interleaved line, 8 at a time
      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
      {
        __m128 a = _mm_load_ps(spl);
        __m128 b = _mm_load_ps(sph);
        __m128 c = _mm_unpacklo_ps(a, b);
        __m128 d = _mm_unpackhi_ps(a, b);
        _mm_store_ps(dp, c);
        _mm_store_ps(dp + 4, d);
      }
    }
    static void sse2_deinterleave64(double* dpl, double* dph, double* sp,
                                    int width)
    {
      // same as sse2_deinterleave32, but for 64-bit samples, 4 per iteration
      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
      {
        __m128d a = _mm_load_pd(sp);
        __m128d b = _mm_load_pd(sp + 2);
        __m128d c = _mm_shuffle_pd(a, b, 0);
        __m128d d = _mm_shuffle_pd(a, b, 3);
        _mm_store_pd(dpl, c);
        _mm_store_pd(dph, d);
      }
    }
    static void sse2_interleave64(double* dp, double* spl, double* sph,
                                  int width)
    {
      // same as sse2_interleave32, but for 64-bit samples, 4 per iteration
      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
      {
        __m128d a = _mm_load_pd(spl);
        __m128d b = _mm_load_pd(sph);
        __m128d c = _mm_unpacklo_pd(a, b);
        __m128d d = _mm_unpackhi_pd(a, b);
        _mm_store_pd(dp, c);
        _mm_store_pd(dp + 2, d);
      }
    }
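    // Scalar sketch of the split performed by the deinterleave helpers above,
    // for illustration only; the function name is hypothetical and not part
    // of the library.  Even-indexed samples go to dpl and odd-indexed samples
    // to dph; the callers below decide which of those two buffers is the
    // low-pass and which is the high-pass line based on the parity flag.
    static void deinterleave32_scalar_sketch(float* dpl, float* dph,
                                             const float* sp, int width)
    {
      for (int i = 0; i + 1 < width; i += 2)
      {
        *dpl++ = sp[i];      // even-position sample
        *dph++ = sp[i + 1];  // odd-position sample
      }
    }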
    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m128i vb = _mm_set1_epi32(b);

      si32* dst = aug->i32;
      const si32* src1 = sig->i32, * src2 = other->i32;
      // The lifting step is dst[i] -/+= (b + a * (src1[i] + src2[i])) >> e,
      // subtracting during synthesis and adding during analysis; the common
      // a == 1 and a == -1 cases below avoid the multiplication.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_add_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_add_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i w = _mm_srai_epi32(t, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i w = _mm_srai_epi32(t, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1 that is not the 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_sub_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_sub_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else
      { // general case
        if (synthesis)
          for (ui32 i = repeat; i > 0; --i)
            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
        else
          for (ui32 i = repeat; i > 0; --i)
            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
      }
    }
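    // Worked example (explanatory note, not from the library source): the
    // reversible CDF 5/3 transform of JPEG 2000 uses two lifting steps,
    //   predict: (Aatk, Batk, Eatk) = (-1, 1, 1)
    //   update:  (Aatk, Batk, Eatk) = ( 1, 2, 2)
    // so the generic analysis update dst[i] += (b + a * (s1[i] + s2[i])) >> e
    // becomes dst[i] += (1 - (s1[i] + s2[i])) >> 1 for the predict and
    // dst[i] += (2 + s1[i] + s2[i]) >> 2 for the update.  Because
    // (1 - t) >> 1 == -(t >> 1) for two's-complement arithmetic shifts, the
    // 5/3 predict branch can drop the vb term and negate instead, which is
    // why it tests a == -1 && b == 1 && e == 1 specifically.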
    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si64 a = s->rev.Aatk;
      const si64 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m128i vb = _mm_set1_epi64x(b);
      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));  // for sse2_mm_srai_epi64

      si64* dst = aug->i64;
      const si64* src1 = sig->i64, * src2 = other->i64;
      // Same structure as sse2_rev_vert_step32, but two 64-bit samples per
      // iteration and an emulated 64-bit arithmetic shift.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_add_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_add_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i w = sse2_mm_srai_epi64(t, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i w = sse2_mm_srai_epi64(t, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1 that is not the 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_sub_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_sub_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else
      { // general case
        if (synthesis)
          for (ui32 i = repeat; i > 0; --i)
            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
        else
          for (ui32 i = repeat; i > 0; --i)
            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
      }
    }
    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
    {
      // dispatch on the line-buffer sample width (32- vs 64-bit integers)
      if (aug->flags & line_buf::LFT_32BIT)
        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
      else
        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
    }
    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into its even and odd samples
        {
          float* dpl = even ? ldst->f32 : hdst->f32;
          float* dph = even ? hdst->f32 : ldst->f32;
          float* sp = src->f32;
          int w = (int)width;
          sse2_deinterleave32(dpl, dph, sp, w);
        }

        si32* hp = hdst->i32, * lp = ldst->i32;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass line width
        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass line width
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);

          // boundary extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si32* sp = lp;
          si32* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1 that is not the 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else
          { // general case
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
            else
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[-1] + sp[0])) >> e;
          }

          // swap the roles of the low-pass and high-pass lines
          si32* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else
      {
        if (even)
          ldst->i32[0] = src->i32[0];
        else
          hdst->i32[0] = src->i32[0] << 1;
      }
    }
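    // Explanatory note (not from the library source): in the analysis loop
    // above, dp walks the high-pass line while sp walks the low-pass line.
    // When the line starts with an even sample, high-pass sample i lies
    // between low-pass samples i and i + 1, hence the (sp, sp + 1) loads;
    // when it starts with an odd sample, it lies between i - 1 and i, hence
    // (sp - 1, sp).  The one-sample extensions lp[-1] = lp[0] and
    // lp[l_width] = lp[l_width - 1] keep the boundary taps of those
    // unaligned _mm_loadu_si128 reads valid.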
    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into its even and odd samples
        {
          double* dpl = (double*)(even ? ldst->p : hdst->p);
          double* dph = (double*)(even ? hdst->p : ldst->p);
          double* sp = (double*)src->p;
          int w = (int)width;
          sse2_deinterleave64(dpl, dph, sp, w);
        }

        si64* hp = hdst->i64, * lp = ldst->i64;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass line width
        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass line width
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi64x(b);
          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));

          // boundary extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si64* sp = lp;
          si64* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1 that is not the 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else
          { // general case
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
            else
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[-1] + sp[0])) >> e;
          }

          // swap the roles of the low-pass and high-pass lines
          si64* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else
      {
        if (even)
          ldst->i64[0] = src->i64[0];
        else
          hdst->i64[0] = src->i64[0] << 1;
      }
    }
    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
    {
      // dispatch on the line-buffer sample width (32- vs 64-bit integers)
      if (src->flags & line_buf::LFT_32BIT)
        sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
      else
        sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
    }
    void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si32* oth = hsrc->i32, * aug = lsrc->i32;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass line width
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass line width
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);

          // boundary extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // undo the lifting step
          const si32* sp = oth;
          si32* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1 that is not the 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else
          { // general case
            if (ev)
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
            else
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[0] + sp[1])) >> e;
          }

          // swap the roles of the low-pass and high-pass lines
          si32* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine lsrc and hsrc into the interleaved dst
        {
          float* dp = dst->f32;
          float* spl = even ? lsrc->f32 : hsrc->f32;
          float* sph = even ? hsrc->f32 : lsrc->f32;
          int w = (int)width;
          sse2_interleave32(dp, spl, sph, w);
        }
      }
      else
      {
        if (even)
          dst->i32[0] = lsrc->i32[0];
        else
          dst->i32[0] = hsrc->i32[0] >> 1;
      }
    }
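    // Explanatory note (not from the library source): synthesis mirrors
    // analysis.  The analysis routines above deinterleave first and then
    // apply the lifting steps from index num_steps - 1 down to 0, adding the
    // computed correction; the synthesis routines apply the steps from index
    // 0 up to num_steps - 1 with the opposite sign, undoing what analysis
    // added, and only then interleave the low- and high-pass lines back into
    // dst.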
    void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si64* oth = hsrc->i64, * aug = lsrc->i64;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass line width
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass line width
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi64x(b);
          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));

          // boundary extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // undo the lifting step
          const si64* sp = oth;
          si64* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1 that is not the 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else
          { // general case
            if (ev)
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
            else
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[0] + sp[1])) >> e;
          }

          // swap the roles of the low-pass and high-pass lines
          si64* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine lsrc and hsrc into the interleaved dst
        {
          double* dp = (double*)dst->p;
          double* spl = (double*)(even ? lsrc->p : hsrc->p);
          double* sph = (double*)(even ? hsrc->p : lsrc->p);
          int w = (int)width;
          sse2_interleave64(dp, spl, sph, w);
        }
      }
      else
      {
        if (even)
          dst->i64[0] = lsrc->i64[0];
        else
          dst->i64[0] = hsrc->i64[0] >> 1;
      }
    }
    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
    {
      // dispatch on the line-buffer sample width (32- vs 64-bit integers)
      if (dst->flags & line_buf::LFT_32BIT)
        sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
      else
        sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
    }