//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
33 // File: ojph_colour_sse.cpp 34 // Author: Aous Naman 35 // Date: 11 October 2019 36 //***************************************************************************/ 37 38 #include <cmath> 39 40 #include "ojph_defs.h" 41 #include "ojph_arch.h" 42 #include "ojph_colour.h" 43 #include "ojph_colour_local.h" 44 45 #ifdef OJPH_COMPILER_MSVC 46 #include <intrin.h> 47 #else 48 #include <x86intrin.h> 49 #endif 50 51 namespace ojph { 52 namespace local { 53 54 ////////////////////////////////////////////////////////////////////////// sse_cnvrt_si32_to_float_shftd(const si32 * sp,float * dp,float mul,int width)55 void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, 56 int width) 57 { 58 __m128 shift = _mm_set1_ps(0.5f); 59 __m128 m = _mm_set1_ps(mul); 60 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) 61 { 62 __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp)); 63 __m128 s = _mm_cvtepi32_ps(t); 64 s = _mm_mul_ps(s, m); 65 s = _mm_sub_ps(s, shift); 66 _mm_store_ps(dp, s); 67 } 68 } 69 70 ////////////////////////////////////////////////////////////////////////// sse_cnvrt_si32_to_float(const si32 * sp,float * dp,float mul,int width)71 void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, 72 int width) 73 { 74 __m128 m = _mm_set1_ps(mul); 75 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) 76 { 77 __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp)); 78 __m128 s = _mm_cvtepi32_ps(t); 79 s = _mm_mul_ps(s, m); 80 _mm_store_ps(dp, s); 81 } 82 } 83 84 ////////////////////////////////////////////////////////////////////////// sse_cnvrt_float_to_si32_shftd(const float * sp,si32 * dp,float mul,int width)85 void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, 86 int width) 87 { 88 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); 89 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 90 __m128 shift = _mm_set1_ps(0.5f); 91 __m128 m = _mm_set1_ps(mul); 92 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4) 93 { 94 __m128 t 
= _mm_load_ps(sp); 95 __m128 s = _mm_add_ps(t, shift); 96 s = _mm_mul_ps(s, m); 97 // the following is a poorly designed code, but it is the only 98 // code that I am aware of that compiles on VS 32 and 64 modes 99 t = s; 100 *dp++ = _mm_cvtss_si32(t); 101 t = _mm_shuffle_ps(s, s, 1); 102 *dp++ = _mm_cvtss_si32(t); 103 t = _mm_shuffle_ps(s, s, 2); 104 *dp++ = _mm_cvtss_si32(t); 105 t = _mm_shuffle_ps(s, s, 3); 106 *dp++ = _mm_cvtss_si32(t); 107 } 108 _MM_SET_ROUNDING_MODE(rounding_mode); 109 } 110 111 ////////////////////////////////////////////////////////////////////////// sse_cnvrt_float_to_si32(const float * sp,si32 * dp,float mul,int width)112 void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, 113 int width) 114 { 115 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); 116 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 117 __m128 m = _mm_set1_ps(mul); 118 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4) 119 { 120 __m128 t = _mm_load_ps(sp); 121 __m128 s = _mm_mul_ps(t, m); 122 // the following is a poorly designed code, but it is the only 123 // code that I am aware of that compiles on VS 32 and 64 modes 124 t = s; 125 *dp++ = _mm_cvtss_si32(t); 126 t = _mm_shuffle_ps(s, s, 1); 127 *dp++ = _mm_cvtss_si32(t); 128 t = _mm_shuffle_ps(s, s, 2); 129 *dp++ = _mm_cvtss_si32(t); 130 t = _mm_shuffle_ps(s, s, 3); 131 *dp++ = _mm_cvtss_si32(t); 132 } 133 _MM_SET_ROUNDING_MODE(rounding_mode); 134 } 135 136 ////////////////////////////////////////////////////////////////////////// sse_ict_forward(const float * r,const float * g,const float * b,float * y,float * cb,float * cr,int repeat)137 void sse_ict_forward(const float *r, const float *g, const float *b, 138 float *y, float *cb, float *cr, int repeat) 139 { 140 __m128 alpha_rf = _mm_set1_ps(CT_CNST::ALPHA_RF); 141 __m128 alpha_gf = _mm_set1_ps(CT_CNST::ALPHA_GF); 142 __m128 alpha_bf = _mm_set1_ps(CT_CNST::ALPHA_BF); 143 __m128 beta_cbf = _mm_set1_ps(CT_CNST::BETA_CbF); 144 __m128 beta_crf = 
_mm_set1_ps(CT_CNST::BETA_CrF); 145 for (int i = (repeat + 3) >> 2; i > 0; --i) 146 { 147 __m128 mr = _mm_load_ps(r); 148 __m128 mb = _mm_load_ps(b); 149 __m128 my = _mm_mul_ps(alpha_rf, mr); 150 my = _mm_add_ps(my, _mm_mul_ps(alpha_gf, _mm_load_ps(g))); 151 my = _mm_add_ps(my, _mm_mul_ps(alpha_bf, mb)); 152 _mm_store_ps(y, my); 153 _mm_store_ps(cb, _mm_mul_ps(beta_cbf, _mm_sub_ps(mb, my))); 154 _mm_store_ps(cr, _mm_mul_ps(beta_crf, _mm_sub_ps(mr, my))); 155 156 r += 4; g += 4; b += 4; 157 y += 4; cb += 4; cr += 4; 158 } 159 } 160 161 ////////////////////////////////////////////////////////////////////////// sse_ict_backward(const float * y,const float * cb,const float * cr,float * r,float * g,float * b,int repeat)162 void sse_ict_backward(const float *y, const float *cb, const float *cr, 163 float *r, float *g, float *b, int repeat) 164 { 165 __m128 gamma_cr2g = _mm_set1_ps(CT_CNST::GAMMA_CR2G); 166 __m128 gamma_cb2g = _mm_set1_ps(CT_CNST::GAMMA_CB2G); 167 __m128 gamma_cr2r = _mm_set1_ps(CT_CNST::GAMMA_CR2R); 168 __m128 gamma_cb2b = _mm_set1_ps(CT_CNST::GAMMA_CB2B); 169 for (int i = (repeat + 3) >> 2; i > 0; --i) 170 { 171 __m128 my = _mm_load_ps(y); 172 __m128 mcr = _mm_load_ps(cr); 173 __m128 mcb = _mm_load_ps(cb); 174 __m128 mg = _mm_sub_ps(my, _mm_mul_ps(gamma_cr2g, mcr)); 175 _mm_store_ps(g, _mm_sub_ps(mg, _mm_mul_ps(gamma_cb2g, mcb))); 176 _mm_store_ps(r, _mm_add_ps(my, _mm_mul_ps(gamma_cr2r, mcr))); 177 _mm_store_ps(b, _mm_add_ps(my, _mm_mul_ps(gamma_cb2b, mcb))); 178 179 y += 4; cb += 4; cr += 4; 180 r += 4; g += 4; b += 4; 181 } 182 } 183 } 184 } 185