OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_sse2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_colour_sse2.cpp
// Author: Aous Naman
// Date: 11 October 2019
//***************************************************************************/

#include "ojph_arch.h"
#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)

#include <climits>
#include <cmath>

#include "ojph_defs.h"
#include "ojph_mem.h"
#include "ojph_colour.h"

#include <emmintrin.h>

namespace ojph {
  namespace local {

    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                        ui32 width)
    {
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
      __m128 shift = _mm_set1_ps(0.5f);
      __m128 m = _mm_set1_ps(mul);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128 t = _mm_loadu_ps(sp);
        __m128 s = _mm_add_ps(t, shift);
        s = _mm_mul_ps(s, m);
        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
      }
      _MM_SET_ROUNDING_MODE(rounding_mode);
    }
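
    // An illustrative scalar sketch of what the vector loop above computes
    // per sample, under round-to-nearest (assuming, as the SIMD code does,
    // that line buffers are padded to a multiple of 4 samples):
    //
    //   for (ui32 i = 0; i < width; ++i)
    //     dp[i] = (si32)lrintf((sp[i] + 0.5f) * mul);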

    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                  ui32 width)
    {
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
      __m128 m = _mm_set1_ps(mul);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128 t = _mm_loadu_ps(sp);
        __m128 s = _mm_mul_ps(t, m);
        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
      }
      _MM_SET_ROUNDING_MODE(rounding_mode);
    }

    static inline
    __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
    {
      __m128 ct = _mm_cmpge_ps(x, y);     // 0xFFFFFFFF for x >= y
      __m128i c = _mm_castps_si128(ct);   // does not generate any code
      __m128i d = _mm_and_si128(c, a);    // keep only a, where x >= y
      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x < y
      return _mm_or_si128(d, e);          // combine
    }

    static inline
    __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
    {
      __m128 ct = _mm_cmplt_ps(x, y);     // 0xFFFFFFFF for x < y
      __m128i c = _mm_castps_si128(ct);   // does not generate any code
      __m128i d = _mm_and_si128(c, a);    // keep only a, where x < y
      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x >= y
      return _mm_or_si128(d, e);          // combine
    }
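
    // Both helpers above implement a branchless select. An illustrative
    // scalar equivalent, per 32-bit lane:
    //
    //   ojph_mm_max_ge_epi32: r = (x >= y) ? a : b;
    //   ojph_mm_min_lt_epi32: r = (x <  y) ? a : b;
    //
    // The float compare yields an all-ones or all-zeros lane mask, which
    // the and/andnot/or sequence then uses to blend the two integer sources.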

    template <bool NLT_TYPE3>
    static inline
    void local_sse2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER));

      assert(bit_depth <= 32);
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

      const float* sp = src_line->f32;
      si32* dp = dst_line->i32 + dst_line_offset;
      // Converting to integer can exceed the dynamic range of a 32-bit
      // integer, so care must be exercised.  We check whether the floating
      // point value lies outside the half-open interval [-0.5f, 0.5f); if
      // it does, we clamp the resulting integer to the maximum/minimum
      // that bit_depth supports.
      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
      __m128 mul = _mm_set1_ps((float)(1ULL << bit_depth));
      __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit); // val < upper
      __m128 fl_low_lim = _mm_set1_ps((float)neg_limit); // val >= lower
      __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
      __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));

      if (is_signed)
      {
        __m128i zero = _mm_setzero_si128();
        __m128i bias =
          _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128 t = _mm_loadu_ps(sp);
          t = _mm_mul_ps(t, mul);
          __m128i u = _mm_cvtps_epi32(t);
          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          if (NLT_TYPE3)
          {
            __m128i c = _mm_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value
            __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
            neg = _mm_and_si128(c, neg);          // keep only - bias - value
            u = _mm_andnot_si128(c, u);           // keep only +ve or 0
            u = _mm_or_si128(neg, u);             // combine
          }
          _mm_storeu_si128((__m128i*)dp, u);
        }
      }
      else
      {
        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128 t = _mm_loadu_ps(sp);
          t = _mm_mul_ps(t, mul);
          __m128i u = _mm_cvtps_epi32(t);
          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          u = _mm_add_epi32(u, half);
          _mm_storeu_si128((__m128i*)dp, u);
        }
      }

      _MM_SET_ROUNDING_MODE(rounding_mode);
    }
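
    // Illustrative scalar equivalent of the signed path above (the
    // unsigned path adds 2^(bit_depth-1) instead of applying NLT type 3):
    //
    //   float t = sp[i] * (float)(1ULL << bit_depth);
    //   si32 u = (si32)lrintf(t);
    //   if (t <  (float)neg_limit) u = INT_MIN >> (32 - bit_depth);
    //   if (t >= -(float)neg_limit) u = INT_MAX >> (32 - bit_depth);
    //   if (NLT_TYPE3 && u < 0)
    //     u = -(si32)((1ULL << (bit_depth - 1)) + 1) - u;  // bias - u
    //   dp[i] = u;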

    void sse2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_integer<false>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_integer<true>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    // emulates 64-bit arithmetic right shift, which SSE2 lacks; adapted from
    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
    {
      // note that m must be obtained using
      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
      __m128i x = _mm_srli_epi64(a, amt);
      x = _mm_xor_si128(x, m);
      __m128i result = _mm_sub_epi64(x, m);
      return result;
    }
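
    // Why the xor/sub trick works: after the logical shift, each lane's
    // original sign bit sits at position 63 - amt, exactly where m has its
    // single set bit. (x ^ m) - m therefore sign-extends from that bit:
    // for non-negative inputs the xor sets the bit and the subtraction
    // clears it again (no net change); for negative inputs the xor clears
    // it and the subtraction borrows, filling all higher bits with ones.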

    static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
    {
      __m128i t;
      t = _mm_cmplt_epi32(a, zero); // get -ve
      t = _mm_unpacklo_epi32(a, t);
      return t;
    }

    static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
    {
      __m128i t;
      t = _mm_cmplt_epi32(a, zero); // get -ve
      t = _mm_unpackhi_epi32(a, t);
      return t;
    }
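
    // Sign-extension sketch for the two helpers above:
    // _mm_cmplt_epi32(a, zero) produces 0xFFFFFFFF in lanes holding
    // negative values and 0 otherwise, which is precisely the high 32 bits
    // of the sign-extended 64-bit result; interleaving the source with
    // that mask yields (low32, sign32) pairs, i.e. the scalar equivalent
    // of dp[i] = (si64)sp[i].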

    void sse2_rev_convert(const line_buf *src_line,
                          const ui32 src_line_offset,
                          line_buf *dst_line,
                          const ui32 dst_line_offset,
                          si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m128i sh = _mm_set1_epi32((si32)shift);
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s = _mm_loadu_si128((__m128i*)sp);
            s = _mm_add_epi32(s, sh);
            _mm_storeu_si128((__m128i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m128i zero = _mm_setzero_si128();
          __m128i sh = _mm_set1_epi64x(shift);
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s, t;
            s = _mm_loadu_si128((__m128i*)sp);

            t = sse2_cvtlo_epi32_epi64(s, zero);
            t = _mm_add_epi64(t, sh);
            _mm_storeu_si128((__m128i*)dp, t);

            t = sse2_cvthi_epi32_epi64(s, zero);
            t = _mm_add_epi64(t, sh);
            _mm_storeu_si128((__m128i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        __m128i sh = _mm_set1_epi64x(shift);
        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
        {
          __m128i s, t;
          s = _mm_loadu_si128((__m128i*)sp);
          s = _mm_add_epi64(s, sh);

          t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm_and_si128(low_bits, t);

          s = _mm_loadu_si128((__m128i*)sp + 1);
          s = _mm_add_epi64(s, sh);

          s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
          s = _mm_andnot_si128(low_bits, s);

          t = _mm_or_si128(s, t);
          _mm_storeu_si128((__m128i*)dp, t);
        }
      }
    }
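
    // The 64-to-32 narrowing above, sketched: each shuffle gathers the low
    // 32 bits of the two 64-bit sums into the desired lanes, and the
    // low_bits mask keeps the two halves disjoint so they can be or-ed
    // into one register of four samples. Per sample, this is simply
    //
    //   dp[i] = (si32)(sp[i] + shift);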

    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
                                    const ui32 src_line_offset,
                                    line_buf *dst_line,
                                    const ui32 dst_line_offset,
                                    si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m128i sh = _mm_set1_epi32((si32)(-shift));
          __m128i zero = _mm_setzero_si128();
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s = _mm_loadu_si128((__m128i*)sp);
            __m128i c = _mm_cmplt_epi32(s, zero);  // 0xFFFFFFFF for -ve value
            __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh);     // keep only - shift - value
            s = _mm_andnot_si128(c, s);            // keep only +ve or 0
            s = _mm_or_si128(s, v_m_sh);           // combine
            _mm_storeu_si128((__m128i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m128i sh = _mm_set1_epi64x(-shift);
          __m128i zero = _mm_setzero_si128();
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s, t, u, c, v_m_sh;
            s = _mm_loadu_si128((__m128i*)sp);

            t = _mm_cmplt_epi32(s, zero);      // find -ve 32bit -1
            u = _mm_unpacklo_epi32(s, t);      // correct 64bit data
            c = _mm_unpacklo_epi32(t, t);      // 64bit -1 for -ve value

            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
            u = _mm_or_si128(u, v_m_sh);       // combine

            _mm_storeu_si128((__m128i*)dp, u);

            u = _mm_unpackhi_epi32(s, t);      // correct 64bit data
            c = _mm_unpackhi_epi32(t, t);      // 64bit -1 for -ve value

            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
            u = _mm_or_si128(u, v_m_sh);       // combine

            _mm_storeu_si128((__m128i*)dp + 1, u);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m128i sh = _mm_set1_epi64x(-shift);
        __m128i zero = _mm_setzero_si128();
        __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
        {
          // s for source, t for target, p for positive, n for negative,
          // m for mask, and tm for temp
          __m128i s, t, p, n, m, tm;
          s = _mm_loadu_si128((__m128i*)sp);

          tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
          tm = _mm_sub_epi64(sh, s);     // - shift - value
          n = _mm_and_si128(m, tm);      // -ve
          p = _mm_andnot_si128(m, s);    // +ve
          tm = _mm_or_si128(n, p);
          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm_and_si128(half_mask, tm);

          s = _mm_loadu_si128((__m128i*)sp + 1);
          tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
          tm = _mm_sub_epi64(sh, s);     // - shift - value
          n = _mm_and_si128(m, tm);      // -ve
          p = _mm_andnot_si128(m, s);    // +ve
          tm = _mm_or_si128(n, p);
          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
          tm = _mm_andnot_si128(half_mask, tm);

          t = _mm_or_si128(t, tm);
          _mm_storeu_si128((__m128i*)dp, t);
        }
      }
    }
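
    // Scalar sketch of the NLT type 3 mapping applied in all three
    // branches above, per sample:
    //
    //   dp[i] = (sp[i] >= 0) ? sp[i] : (-shift - sp[i]);
    //
    // The branches differ only in source/destination widths (32->32,
    // 32->64, and 64->32).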

    template<bool NLT_TYPE3>
    static inline
    void local_sse2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER) == 0);

      assert(bit_depth <= 32);
      __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

      const si32* sp = src_line->i32 + src_line_offset;
      float* dp = dst_line->f32;
      if (is_signed)
      {
        __m128i zero = _mm_setzero_si128();
        __m128i bias =
          _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128i t = _mm_loadu_si128((__m128i*)sp);
          if (NLT_TYPE3)
          {
            __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve value
            __m128i neg = _mm_sub_epi32(bias, t); // - bias - value
            neg = _mm_and_si128(c, neg);          // keep only - bias - value
            c = _mm_andnot_si128(c, t);           // keep only +ve or 0
            t = _mm_or_si128(neg, c);             // combine
          }
          __m128 v = _mm_cvtepi32_ps(t);
          v = _mm_mul_ps(v, mul);
          _mm_storeu_ps(dp, v);
        }
      }
      else
      {
        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128i t = _mm_loadu_si128((__m128i*)sp);
          t = _mm_sub_epi32(t, half);
          __m128 v = _mm_cvtepi32_ps(t);
          v = _mm_mul_ps(v, mul);
          _mm_storeu_ps(dp, v);
        }
      }
    }
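
    // Illustrative scalar equivalents of the two paths above:
    //
    //   signed:   si32 t = sp[i];
    //             if (NLT_TYPE3 && t < 0)
    //               t = -(si32)((1ULL << (bit_depth - 1)) + 1) - t;
    //             dp[i] = (float)t / (float)(1ULL << bit_depth);
    //   unsigned: dp[i] = (float)(sp[i] - (si32)(1ULL << (bit_depth - 1)))
    //                     / (float)(1ULL << bit_depth);
    //
    // The NLT type 3 mapping is an involution, so the same (bias - value)
    // formula used in convert_to_integer undoes it here.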

    void sse2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_float<false>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_float<true>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    void sse2_rct_forward(const line_buf *r,
                          const line_buf *g,
                          const line_buf *b,
                          line_buf *y, line_buf *cb, line_buf *cr,
                          ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i mr = _mm_load_si128((__m128i*)rp);
          __m128i mg = _mm_load_si128((__m128i*)gp);
          __m128i mb = _mm_load_si128((__m128i*)bp);
          __m128i t = _mm_add_epi32(mr, mb);
          t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
          _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));
          t = _mm_sub_epi32(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi32(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          rp += 4; gp += 4; bp += 4;
          yp += 4; cbp += 4; crp += 4;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m128i zero = _mm_setzero_si128();
        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i mr32 = _mm_load_si128((__m128i*)rp);
          __m128i mg32 = _mm_load_si128((__m128i*)gp);
          __m128i mb32 = _mm_load_si128((__m128i*)bp);
          __m128i mr, mg, mb, t;
          mr = sse2_cvtlo_epi32_epi64(mr32, zero);
          mg = sse2_cvtlo_epi32_epi64(mg32, zero);
          mb = sse2_cvtlo_epi32_epi64(mb32, zero);

          t = _mm_add_epi64(mr, mb);
          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
          t = _mm_sub_epi64(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi64(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          yp += 2; cbp += 2; crp += 2;

          mr = sse2_cvthi_epi32_epi64(mr32, zero);
          mg = sse2_cvthi_epi32_epi64(mg32, zero);
          mb = sse2_cvthi_epi32_epi64(mb32, zero);

          t = _mm_add_epi64(mr, mb);
          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
          t = _mm_sub_epi64(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi64(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          rp += 4; gp += 4; bp += 4;
          yp += 2; cbp += 2; crp += 2;
        }
      }
    }
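
    // The forward reversible color transform (RCT) computed above, in
    // scalar form; the 64-bit branch evaluates the same equations at
    // higher precision:
    //
    //   Y  = (R + 2 * G + B) >> 2;
    //   Cb = B - G;
    //   Cr = R - G;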

    void sse2_rct_backward(const line_buf *y,
                           const line_buf *cb,
                           const line_buf *cr,
                           line_buf *r, line_buf *g, line_buf *b,
                           ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i my = _mm_load_si128((__m128i*)yp);
          __m128i mcb = _mm_load_si128((__m128i*)cbp);
          __m128i mcr = _mm_load_si128((__m128i*)crp);

          __m128i t = _mm_add_epi32(mcb, mcr);
          t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
          _mm_store_si128((__m128i*)gp, t);
          __m128i u = _mm_add_epi32(mcb, t);
          _mm_store_si128((__m128i*)bp, u);
          u = _mm_add_epi32(mcr, t);
          _mm_store_si128((__m128i*)rp, u);

          yp += 4; cbp += 4; crp += 4;
          rp += 4; gp += 4; bp += 4;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i my, mcb, mcr, tr, tg, tb;
          my = _mm_load_si128((__m128i*)yp);
          mcb = _mm_load_si128((__m128i*)cbp);
          mcr = _mm_load_si128((__m128i*)crp);

          tg = _mm_add_epi64(mcb, mcr);
          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
          tb = _mm_add_epi64(mcb, tg);
          tr = _mm_add_epi64(mcr, tg);

          __m128i mr, mg, mb;
          mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
          mr = _mm_and_si128(low_bits, mr);
          mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
          mg = _mm_and_si128(low_bits, mg);
          mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
          mb = _mm_and_si128(low_bits, mb);

          yp += 2; cbp += 2; crp += 2;

          my = _mm_load_si128((__m128i*)yp);
          mcb = _mm_load_si128((__m128i*)cbp);
          mcr = _mm_load_si128((__m128i*)crp);

          tg = _mm_add_epi64(mcb, mcr);
          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
          tb = _mm_add_epi64(mcb, tg);
          tr = _mm_add_epi64(mcr, tg);

          tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
          tr = _mm_andnot_si128(low_bits, tr);
          mr = _mm_or_si128(mr, tr);
          tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
          tg = _mm_andnot_si128(low_bits, tg);
          mg = _mm_or_si128(mg, tg);
          tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
          tb = _mm_andnot_si128(low_bits, tb);
          mb = _mm_or_si128(mb, tb);

          _mm_store_si128((__m128i*)rp, mr);
          _mm_store_si128((__m128i*)gp, mg);
          _mm_store_si128((__m128i*)bp, mb);

          yp += 2; cbp += 2; crp += 2;
          rp += 4; gp += 4; bp += 4;
        }
      }
    }
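
    // And its exact inverse, as computed above:
    //
    //   G = Y - ((Cb + Cr) >> 2);
    //   B = Cb + G;
    //   R = Cr + G;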
  }
}

#endif