OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_codestream_sse2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_codestream_sse2.cpp
34// Author: Aous Naman
35// Date: 15 May 2022
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <climits>
42#include <immintrin.h>
43#include "ojph_defs.h"
44
45namespace ojph {
46 namespace local {
47
50 {
51 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
52 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
53 x0 = _mm_or_si128(x0, x1);
54 x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1,1,1,1]
55 x0 = _mm_or_si128(x0, x1);
56 _mm_storeu_si128((__m128i*)address, x0);
57 return *address;
58 // A single movd t, xmm0 can do the trick, but it is not available
59 // in SSE2 intrinsics. extract_epi32 is available in sse4.1
60 // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
61 // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
62 // return t;
63 }
64
67 {
68 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
69 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
70 x0 = _mm_or_si128(x0, x1);
71 _mm_storeu_si128((__m128i*)address, x0);
72 return *address;
73 // A single movd t, xmm0 can do the trick, but it is not available
74 // in SSE2 intrinsics. extract_epi32 is available in sse4.1
75 // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
76 // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
77 // return t;
78 }
79
81 void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
82 float delta_inv, ui32 count, ui32* max_val)
83 {
84 ojph_unused(delta_inv);
85
86 // convert to sign and magnitude and keep max_val
87 ui32 shift = 31 - K_max;
88 __m128i m0 = _mm_set1_epi32(INT_MIN);
89 __m128i zero = _mm_setzero_si128();
90 __m128i one = _mm_set1_epi32(1);
91 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
92 __m128i *p = (__m128i*)sp;
93 for ( ; count >= 4; count -= 4, p += 1, dp += 4)
94 {
95 __m128i v = _mm_loadu_si128(p);
96 __m128i sign = _mm_cmplt_epi32(v, zero);
97 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
98 __m128i ones = _mm_and_si128(sign, one);
99 val = _mm_add_epi32(val, ones); // 2's complement
100 sign = _mm_and_si128(sign, m0);
101 val = _mm_slli_epi32(val, (int)shift);
102 tmax = _mm_or_si128(tmax, val);
103 val = _mm_or_si128(val, sign);
104 _mm_storeu_si128((__m128i*)dp, val);
105 }
106 if (count)
107 {
108 __m128i v = _mm_loadu_si128(p);
109 __m128i sign = _mm_cmplt_epi32(v, zero);
110 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
111 __m128i ones = _mm_and_si128(sign, one);
112 val = _mm_add_epi32(val, ones); // 2's complement
113 sign = _mm_and_si128(sign, m0);
114 val = _mm_slli_epi32(val, (int)shift);
115
116 __m128i c = _mm_set1_epi32((si32)count);
117 __m128i idx = _mm_set_epi32(3, 2, 1, 0);
118 __m128i mask = _mm_cmpgt_epi32(c, idx);
119 c = _mm_and_si128(val, mask);
120 tmax = _mm_or_si128(tmax, c);
121
122 val = _mm_or_si128(val, sign);
123 _mm_storeu_si128((__m128i*)dp, val);
124 }
125 _mm_storeu_si128((__m128i*)max_val, tmax);
126 }
127
129 void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
130 float delta_inv, ui32 count, ui32* max_val)
131 {
132 ojph_unused(K_max);
133
134 //quantize and convert to sign and magnitude and keep max_val
135
136 __m128 d = _mm_set1_ps(delta_inv);
137 __m128i zero = _mm_setzero_si128();
138 __m128i one = _mm_set1_epi32(1);
139 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
140 float *p = (float*)sp;
141 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
142 {
143 __m128 vf = _mm_loadu_ps(p);
144 vf = _mm_mul_ps(vf, d); // multiply
145 __m128i val = _mm_cvtps_epi32(vf); // convert to int
146 __m128i sign = _mm_cmplt_epi32(val, zero); // get sign
147 val = _mm_xor_si128(val, sign); // negate 1's complement
148 __m128i ones = _mm_and_si128(sign, one);
149 val = _mm_add_epi32(val, ones); // 2's complement
150 tmax = _mm_or_si128(tmax, val);
151 sign = _mm_slli_epi32(sign, 31);
152 val = _mm_or_si128(val, sign);
153 _mm_storeu_si128((__m128i*)dp, val);
154 }
155 if (count)
156 {
157 __m128 vf = _mm_loadu_ps(p);
158 vf = _mm_mul_ps(vf, d); // multiply
159 __m128i val = _mm_cvtps_epi32(vf); // convert to int
160 __m128i sign = _mm_cmplt_epi32(val, zero); // get sign
161 val = _mm_xor_si128(val, sign); // negate 1's complement
162 __m128i ones = _mm_and_si128(sign, one);
163 val = _mm_add_epi32(val, ones); // 2's complement
164
165 __m128i c = _mm_set1_epi32((si32)count);
166 __m128i idx = _mm_set_epi32(3, 2, 1, 0);
167 __m128i mask = _mm_cmpgt_epi32(c, idx);
168 c = _mm_and_si128(val, mask);
169 tmax = _mm_or_si128(tmax, c);
170
171 sign = _mm_slli_epi32(sign, 31);
172 val = _mm_or_si128(val, sign);
173 _mm_storeu_si128((__m128i*)dp, val);
174 }
175 _mm_storeu_si128((__m128i*)max_val, tmax);
176 }
177
179 void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
180 float delta, ui32 count)
181 {
182 ojph_unused(delta);
183 ui32 shift = 31 - K_max;
184 __m128i m1 = _mm_set1_epi32(INT_MAX);
185 __m128i zero = _mm_setzero_si128();
186 __m128i one = _mm_set1_epi32(1);
187 si32 *p = (si32*)dp;
188 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
189 {
190 __m128i v = _mm_load_si128((__m128i*)sp);
191 __m128i val = _mm_and_si128(v, m1);
192 val = _mm_srli_epi32(val, (int)shift);
193 __m128i sign = _mm_cmplt_epi32(v, zero);
194 val = _mm_xor_si128(val, sign); // negate 1's complement
195 __m128i ones = _mm_and_si128(sign, one);
196 val = _mm_add_epi32(val, ones); // 2's complement
197 _mm_storeu_si128((__m128i*)p, val);
198 }
199 }
200
202 void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
203 float delta, ui32 count)
204 {
205 ojph_unused(K_max);
206 __m128i m1 = _mm_set1_epi32(INT_MAX);
207 __m128 d = _mm_set1_ps(delta);
208 float *p = (float*)dp;
209 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
210 {
211 __m128i v = _mm_load_si128((__m128i*)sp);
212 __m128i vali = _mm_and_si128(v, m1);
213 __m128 valf = _mm_cvtepi32_ps(vali);
214 valf = _mm_mul_ps(valf, d);
215 __m128i sign = _mm_andnot_si128(m1, v);
216 valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
217 _mm_storeu_ps(p, valf);
218 }
219 }
220
222 void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
223 float delta_inv, ui32 count, ui64* max_val)
224 {
225 ojph_unused(delta_inv);
226
227 // convert to sign and magnitude and keep max_val
228 ui32 shift = 63 - K_max;
229 __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
230 __m128i zero = _mm_setzero_si128();
231 __m128i one = _mm_set1_epi64x(1);
232 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
233 __m128i *p = (__m128i*)sp;
234 for ( ; count >= 2; count -= 2, p += 1, dp += 2)
235 {
236 __m128i v = _mm_loadu_si128(p);
237 __m128i sign = _mm_cmplt_epi32(v, zero);
238 sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
239 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
240 __m128i ones = _mm_and_si128(sign, one);
241 val = _mm_add_epi64(val, ones); // 2's complement
242 sign = _mm_and_si128(sign, m0);
243 val = _mm_slli_epi64(val, (int)shift);
244 tmax = _mm_or_si128(tmax, val);
245 val = _mm_or_si128(val, sign);
246 _mm_storeu_si128((__m128i*)dp, val);
247 }
248 if (count)
249 {
250 __m128i v = _mm_loadu_si128(p);
251 __m128i sign = _mm_cmplt_epi32(v, zero);
252 sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
253 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
254 __m128i ones = _mm_and_si128(sign, one);
255 val = _mm_add_epi64(val, ones); // 2's complement
256 sign = _mm_and_si128(sign, m0);
257 val = _mm_slli_epi64(val, (int)shift);
258
259 __m128i c = _mm_set_epi32(0, 0, (si32)0xFFFFFFFF, (si32)0xFFFFFFFF);
260 c = _mm_and_si128(val, c);
261 tmax = _mm_or_si128(tmax, c);
262
263 val = _mm_or_si128(val, sign);
264 _mm_storeu_si128((__m128i*)dp, val);
265 }
266 _mm_storeu_si128((__m128i*)max_val, tmax);
267 }
268
270 void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
271 float delta, ui32 count)
272 {
273 ojph_unused(delta);
274 ui32 shift = 63 - K_max;
275 __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
276 __m128i zero = _mm_setzero_si128();
277 __m128i one = _mm_set1_epi64x(1);
278 si64 *p = (si64*)dp;
279 for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
280 {
281 __m128i v = _mm_load_si128((__m128i*)sp);
282 __m128i val = _mm_and_si128(v, m1);
283 val = _mm_srli_epi64(val, (int)shift);
284 __m128i sign = _mm_cmplt_epi32(v, zero);
285 sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
286 val = _mm_xor_si128(val, sign); // negate 1's complement
287 __m128i ones = _mm_and_si128(sign, one);
288 val = _mm_add_epi64(val, ones); // 2's complement
289 _mm_storeu_si128((__m128i*)p, val);
290 }
291 }
292 }
293}
294
295#endif
void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 sse2_find_max_val32(ui32 *address)
ui64 sse2_find_max_val64(ui64 *address)
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
#define ojph_unused(x)
Definition ojph_defs.h:78