//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_colour_sse2.cpp
// Author: Aous Naman
// Date: 11 October 2019
//***************************************************************************/
37 
38 #include <cmath>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_colour.h"
43 
44 #ifdef OJPH_COMPILER_MSVC
45 #include <intrin.h>
46 #else
47 #include <x86intrin.h>
48 #endif
49 
50 namespace ojph {
51   namespace local {
52 
53     //////////////////////////////////////////////////////////////////////////
// Converts float samples to 32-bit integers after offsetting and scaling:
// each output is (sp[k] + 0.5f) * mul, rounded to the nearest integer.
//
//   sp    - source float samples (read with unaligned loads)
//   dp    - destination integer samples (written with unaligned stores)
//   mul   - scale factor applied after the 0.5 offset
//   width - number of samples; work is done in groups of four, so the
//           group count is width rounded up and up to three samples past
//           width may be read from sp and written to dp
//
// The SSE rounding mode is forced to round-to-nearest while converting and
// the mode that was active on entry is put back before returning.
void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                    int width)
{
  const uint32_t saved_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

  const __m128 half = _mm_set1_ps(0.5f);
  const __m128 scale = _mm_set1_ps(mul);
  const int num_groups = (width + 3) >> 2;
  for (int group = 0; group < num_groups; ++group)
  {
    const __m128 offset_samples = _mm_add_ps(_mm_loadu_ps(sp), half);
    const __m128 scaled = _mm_mul_ps(offset_samples, scale);
    // _mm_cvtps_epi32 rounds according to the current rounding mode
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dp), _mm_cvtps_epi32(scaled));
    sp += 4;
    dp += 4;
  }

  _MM_SET_ROUNDING_MODE(saved_mode);
}
70 
71     //////////////////////////////////////////////////////////////////////////
// Converts float samples to 32-bit integers after scaling: each output is
// sp[k] * mul, rounded to the nearest integer.
//
//   sp    - source float samples (read with unaligned loads)
//   dp    - destination integer samples (written with unaligned stores)
//   mul   - scale factor applied to every sample
//   width - number of samples; work is done in groups of four, so the
//           group count is width rounded up and up to three samples past
//           width may be read from sp and written to dp
//
// The SSE rounding mode is forced to round-to-nearest while converting and
// the mode that was active on entry is put back before returning.
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                              int width)
{
  const uint32_t saved_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

  const __m128 scale = _mm_set1_ps(mul);
  const int num_groups = (width + 3) >> 2;
  for (int group = 0; group < num_groups; ++group)
  {
    const __m128 scaled = _mm_mul_ps(_mm_loadu_ps(sp), scale);
    // _mm_cvtps_epi32 rounds according to the current rounding mode
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dp), _mm_cvtps_epi32(scaled));
    sp += 4;
    dp += 4;
  }

  _MM_SET_ROUNDING_MODE(saved_mode);
}
86 
87 
88     //////////////////////////////////////////////////////////////////////////
// Adds the constant `shift` to every 32-bit integer sample of `sp` and
// stores the sums in `dp`.
//
//   sp    - source samples (read with unaligned loads)
//   dp    - destination samples (written with unaligned stores)
//   shift - value added to each sample
//   width - number of samples; work is done in groups of four, so the
//           group count is width rounded up and up to three samples past
//           width may be read from sp and written to dp
//
// NOTE(review): callers presumably size both buffers to a multiple of four
// samples -- confirm against the call sites.
void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
                                   int width)
{
  const __m128i offset = _mm_set1_epi32(shift);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    // the source stays const-qualified; a C-style cast would silently
    // strip the qualifier
    const __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sp));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dp),
                     _mm_add_epi32(s, offset));
  }
}
100 
101     //////////////////////////////////////////////////////////////////////////
// Forward integer colour transform on three planes of 32-bit samples
// (presumably the reversible colour transform, going by the name):
//
//   y  = (r + 2 * g + b) >> 2     (arithmetic right shift)
//   cb = b - g
//   cr = r - g
//
//   r, g, b   - input planes
//   y, cb, cr - output planes
//   repeat    - number of samples; work is done in groups of four, so the
//               group count is repeat rounded up and up to three samples
//               past repeat may be read and written
//
// All six pointers are accessed with aligned 128-bit loads/stores and must
// therefore be 16-byte aligned.
void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b,
                      si32 *y, si32 *cb, si32 *cr, int repeat)
{
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    // inputs stay const-qualified; a C-style cast would silently strip
    // the qualifier
    const __m128i mr = _mm_load_si128(reinterpret_cast<const __m128i*>(r));
    const __m128i mg = _mm_load_si128(reinterpret_cast<const __m128i*>(g));
    const __m128i mb = _mm_load_si128(reinterpret_cast<const __m128i*>(b));

    // y = (r + b + 2g) >> 2
    __m128i t = _mm_add_epi32(mr, mb);
    t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
    _mm_store_si128(reinterpret_cast<__m128i*>(y), _mm_srai_epi32(t, 2));

    // cb = b - g
    _mm_store_si128(reinterpret_cast<__m128i*>(cb), _mm_sub_epi32(mb, mg));

    // cr = r - g
    _mm_store_si128(reinterpret_cast<__m128i*>(cr), _mm_sub_epi32(mr, mg));

    r += 4; g += 4; b += 4;
    y += 4; cb += 4; cr += 4;
  }
}
122 
123     //////////////////////////////////////////////////////////////////////////
// Inverse integer colour transform on three planes of 32-bit samples
// (presumably the reversible colour transform, going by the name):
//
//   g = y - ((cb + cr) >> 2)      (arithmetic right shift)
//   b = cb + g
//   r = cr + g
//
//   y, cb, cr - input planes
//   r, g, b   - output planes
//   repeat    - number of samples; work is done in groups of four, so the
//               group count is repeat rounded up and up to three samples
//               past repeat may be read and written
//
// All six pointers are accessed with aligned 128-bit loads/stores and must
// therefore be 16-byte aligned.
void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
                       si32 *r, si32 *g, si32 *b, int repeat)
{
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    // inputs stay const-qualified; a C-style cast would silently strip
    // the qualifier
    const __m128i my  = _mm_load_si128(reinterpret_cast<const __m128i*>(y));
    const __m128i mcb = _mm_load_si128(reinterpret_cast<const __m128i*>(cb));
    const __m128i mcr = _mm_load_si128(reinterpret_cast<const __m128i*>(cr));

    // g = y - ((cb + cr) >> 2)
    const __m128i chroma_sum = _mm_add_epi32(mcb, mcr);
    const __m128i mg = _mm_sub_epi32(my, _mm_srai_epi32(chroma_sum, 2));
    _mm_store_si128(reinterpret_cast<__m128i*>(g), mg);

    // b = cb + g
    _mm_store_si128(reinterpret_cast<__m128i*>(b), _mm_add_epi32(mcb, mg));

    // r = cr + g
    _mm_store_si128(reinterpret_cast<__m128i*>(r), _mm_add_epi32(mcr, mg));

    y += 4; cb += 4; cr += 4;
    r += 4; g += 4; b += 4;
  }
}
145 
146   }
147 }
148