1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_colour_avx.cpp
34 // Author: Aous Naman
35 // Date: 11 October 2019
36 //***************************************************************************/
37 
38 #include <cmath>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_colour.h"
43 #include "ojph_colour_local.h"
44 
45 #ifdef OJPH_COMPILER_MSVC
46 #include <intrin.h>
47 #else
48 #include <x86intrin.h>
49 #endif
50 
51 namespace ojph {
52   namespace local {
53 
54     //////////////////////////////////////////////////////////////////////////
// Converts signed 32-bit integers to floats, eight samples per iteration:
//   dp[i] = (float)sp[i] * mul - 0.5f
//
//   sp    : source integers; read with unaligned loads.
//   dp    : destination floats; written with _mm256_store_ps, which
//           requires a 32-byte aligned address.
//   mul   : scale factor applied before the 0.5 offset is subtracted.
//   width : number of samples. The loop runs (width + 7) / 8 times, so up
//           to 7 samples past width are read and written; both buffers
//           must be accessible up to that rounded-up length.
void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
                                   int width)
{
  const __m256 scale = _mm256_set1_ps(mul);
  const __m256 half = _mm256_set1_ps(0.5f);
  const int num_vecs = (width + 7) >> 3;
  for (int v = 0; v < num_vecs; ++v, sp += 8, dp += 8)
  {
    const __m256i ints =
      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(sp));
    const __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(ints), scale);
    _mm256_store_ps(dp, _mm256_sub_ps(scaled, half));
  }
}
69 
70     //////////////////////////////////////////////////////////////////////////
// Converts signed 32-bit integers to floats, eight samples per iteration:
//   dp[i] = (float)sp[i] * mul
//
//   sp    : source integers; read with unaligned loads.
//   dp    : destination floats; written with _mm256_store_ps, which
//           requires a 32-byte aligned address.
//   mul   : scale factor applied to every converted sample.
//   width : number of samples. The loop runs (width + 7) / 8 times, so up
//           to 7 samples past width are read and written; both buffers
//           must be accessible up to that rounded-up length.
void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
                             int width)
{
  const __m256 scale = _mm256_set1_ps(mul);
  const int num_vecs = (width + 7) >> 3;
  for (int v = 0; v < num_vecs; ++v, sp += 8, dp += 8)
  {
    const __m256i ints =
      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(sp));
    const __m256 as_float = _mm256_cvtepi32_ps(ints);
    _mm256_store_ps(dp, _mm256_mul_ps(as_float, scale));
  }
}
83 
84     //////////////////////////////////////////////////////////////////////////
// Converts floats to signed 32-bit integers, eight samples per iteration:
//   dp[i] = round_to_nearest((sp[i] + 0.5f) * mul)
//
//   sp    : source floats; read with _mm256_load_ps, which requires a
//           32-byte aligned address.
//   dp    : destination integers; written with unaligned stores.
//   mul   : scale factor applied after the 0.5 offset is added.
//   width : number of samples. The loop runs (width + 7) / 8 times, so up
//           to 7 samples past width are read and written; both buffers
//           must be accessible up to that rounded-up length.
void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                   int width)
{
  const __m256 scale = _mm256_set1_ps(mul);
  const __m256 half = _mm256_set1_ps(0.5f);
  const int num_vecs = (width + 7) >> 3;
  for (int v = 0; v < num_vecs; ++v, sp += 8, dp += 8)
  {
    const __m256 shifted = _mm256_add_ps(_mm256_load_ps(sp), half);
    const __m256 scaled = _mm256_mul_ps(shifted, scale);
    // Round explicitly to the nearest integer value before the int
    // conversion.
    const __m256 rounded =
      _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dp),
                        _mm256_cvtps_epi32(rounded));
  }
}
99 
100     //////////////////////////////////////////////////////////////////////////
// Converts floats to signed 32-bit integers, eight samples per iteration:
//   dp[i] = round_to_nearest(sp[i] * mul)
//
//   sp    : source floats; read with _mm256_load_ps, which requires a
//           32-byte aligned address.
//   dp    : destination integers; written with unaligned stores.
//   mul   : scale factor applied to every sample before rounding.
//   width : number of samples. The loop runs (width + 7) / 8 times, so up
//           to 7 samples past width are read and written; both buffers
//           must be accessible up to that rounded-up length.
void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                             int width)
{
  const __m256 scale = _mm256_set1_ps(mul);
  const int num_vecs = (width + 7) >> 3;
  for (int v = 0; v < num_vecs; ++v, sp += 8, dp += 8)
  {
    const __m256 scaled = _mm256_mul_ps(_mm256_load_ps(sp), scale);
    // Round explicitly to the nearest integer value before the int
    // conversion.
    const __m256 rounded =
      _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dp),
                        _mm256_cvtps_epi32(rounded));
  }
}
113 
114     //////////////////////////////////////////////////////////////////////////
avx_ict_forward(const float * r,const float * g,const float * b,float * y,float * cb,float * cr,int repeat)115     void avx_ict_forward(const float *r, const float *g, const float *b,
116                          float *y, float *cb, float *cr, int repeat)
117     {
118       __m256 alpha_rf = _mm256_set1_ps(CT_CNST::ALPHA_RF);
119       __m256 alpha_gf = _mm256_set1_ps(CT_CNST::ALPHA_GF);
120       __m256 alpha_bf = _mm256_set1_ps(CT_CNST::ALPHA_BF);
121       __m256 beta_cbf = _mm256_set1_ps(CT_CNST::BETA_CbF);
122       __m256 beta_crf = _mm256_set1_ps(CT_CNST::BETA_CrF);
123       for (int i = (repeat + 7) >> 3; i > 0; --i)
124       {
125         __m256 mr = _mm256_load_ps(r);
126         __m256 mb = _mm256_load_ps(b);
127         __m256 my = _mm256_mul_ps(alpha_rf, mr);
128         my = _mm256_add_ps(my, _mm256_mul_ps(alpha_gf, _mm256_load_ps(g)));
129         my = _mm256_add_ps(my, _mm256_mul_ps(alpha_bf, mb));
130         _mm256_store_ps(y, my);
131         _mm256_store_ps(cb, _mm256_mul_ps(beta_cbf, _mm256_sub_ps(mb, my)));
132         _mm256_store_ps(cr, _mm256_mul_ps(beta_crf, _mm256_sub_ps(mr, my)));
133 
134         r += 8; g += 8; b += 8;
135         y += 8; cb += 8; cr += 8;
136       }
137     }
138 
139     //////////////////////////////////////////////////////////////////////////
avx_ict_backward(const float * y,const float * cb,const float * cr,float * r,float * g,float * b,int repeat)140     void avx_ict_backward(const float *y, const float *cb, const float *cr,
141                           float *r, float *g, float *b, int repeat)
142     {
143       __m256 gamma_cr2g = _mm256_set1_ps(CT_CNST::GAMMA_CR2G);
144       __m256 gamma_cb2g = _mm256_set1_ps(CT_CNST::GAMMA_CB2G);
145       __m256 gamma_cr2r = _mm256_set1_ps(CT_CNST::GAMMA_CR2R);
146       __m256 gamma_cb2b = _mm256_set1_ps(CT_CNST::GAMMA_CB2B);
147       for (int i = (repeat + 7) >> 3; i > 0; --i)
148       {
149         __m256 my = _mm256_load_ps(y);
150         __m256 mcr = _mm256_load_ps(cr);
151         __m256 mcb = _mm256_load_ps(cb);
152         __m256 mg = _mm256_sub_ps(my, _mm256_mul_ps(gamma_cr2g, mcr));
153         _mm256_store_ps(g, _mm256_sub_ps(mg, _mm256_mul_ps(gamma_cb2g, mcb)));
154         _mm256_store_ps(r, _mm256_add_ps(my, _mm256_mul_ps(gamma_cr2r, mcr)));
155         _mm256_store_ps(b, _mm256_add_ps(my, _mm256_mul_ps(gamma_cb2b, mcb)));
156 
157         y += 8; cb += 8; cr += 8;
158         r += 8; g += 8; b += 8;
159       }
160     }
161 
162   }
163 }
164