//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
33 // File: ojph_colour_avx.cpp 34 // Author: Aous Naman 35 // Date: 11 October 2019 36 //***************************************************************************/ 37 38 #include <cmath> 39 40 #include "ojph_defs.h" 41 #include "ojph_arch.h" 42 #include "ojph_colour.h" 43 #include "ojph_colour_local.h" 44 45 #ifdef OJPH_COMPILER_MSVC 46 #include <intrin.h> 47 #else 48 #include <x86intrin.h> 49 #endif 50 51 namespace ojph { 52 namespace local { 53 54 ////////////////////////////////////////////////////////////////////////// avx_cnvrt_si32_to_float_shftd(const si32 * sp,float * dp,float mul,int width)55 void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, 56 int width) 57 { 58 __m256 shift = _mm256_set1_ps(0.5f); 59 __m256 m = _mm256_set1_ps(mul); 60 for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) 61 { 62 __m256i t = _mm256_loadu_si256((__m256i*)sp); 63 __m256 s = _mm256_cvtepi32_ps(t); 64 s = _mm256_mul_ps(s, m); 65 s = _mm256_sub_ps(s, shift); 66 _mm256_store_ps(dp, s); 67 } 68 } 69 70 ////////////////////////////////////////////////////////////////////////// avx_cnvrt_si32_to_float(const si32 * sp,float * dp,float mul,int width)71 void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, 72 int width) 73 { 74 __m256 m = _mm256_set1_ps(mul); 75 for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) 76 { 77 __m256i t = _mm256_loadu_si256((__m256i*)sp); 78 __m256 s = _mm256_cvtepi32_ps(t); 79 s = _mm256_mul_ps(s, m); 80 _mm256_store_ps(dp, s); 81 } 82 } 83 84 ////////////////////////////////////////////////////////////////////////// avx_cnvrt_float_to_si32_shftd(const float * sp,si32 * dp,float mul,int width)85 void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, 86 int width) 87 { 88 __m256 shift = _mm256_set1_ps(0.5f); 89 __m256 m = _mm256_set1_ps(mul); 90 for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) 91 { 92 __m256 t = _mm256_load_ps(sp); 93 __m256 s = _mm256_add_ps(t, shift); 94 s = 
_mm256_mul_ps(s, m); 95 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); 96 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s)); 97 } 98 } 99 100 ////////////////////////////////////////////////////////////////////////// avx_cnvrt_float_to_si32(const float * sp,si32 * dp,float mul,int width)101 void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, 102 int width) 103 { 104 __m256 m = _mm256_set1_ps(mul); 105 for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) 106 { 107 __m256 t = _mm256_load_ps(sp); 108 __m256 s = _mm256_mul_ps(t, m); 109 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); 110 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s)); 111 } 112 } 113 114 ////////////////////////////////////////////////////////////////////////// avx_ict_forward(const float * r,const float * g,const float * b,float * y,float * cb,float * cr,int repeat)115 void avx_ict_forward(const float *r, const float *g, const float *b, 116 float *y, float *cb, float *cr, int repeat) 117 { 118 __m256 alpha_rf = _mm256_set1_ps(CT_CNST::ALPHA_RF); 119 __m256 alpha_gf = _mm256_set1_ps(CT_CNST::ALPHA_GF); 120 __m256 alpha_bf = _mm256_set1_ps(CT_CNST::ALPHA_BF); 121 __m256 beta_cbf = _mm256_set1_ps(CT_CNST::BETA_CbF); 122 __m256 beta_crf = _mm256_set1_ps(CT_CNST::BETA_CrF); 123 for (int i = (repeat + 7) >> 3; i > 0; --i) 124 { 125 __m256 mr = _mm256_load_ps(r); 126 __m256 mb = _mm256_load_ps(b); 127 __m256 my = _mm256_mul_ps(alpha_rf, mr); 128 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_gf, _mm256_load_ps(g))); 129 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_bf, mb)); 130 _mm256_store_ps(y, my); 131 _mm256_store_ps(cb, _mm256_mul_ps(beta_cbf, _mm256_sub_ps(mb, my))); 132 _mm256_store_ps(cr, _mm256_mul_ps(beta_crf, _mm256_sub_ps(mr, my))); 133 134 r += 8; g += 8; b += 8; 135 y += 8; cb += 8; cr += 8; 136 } 137 } 138 139 ////////////////////////////////////////////////////////////////////////// avx_ict_backward(const float * 
y,const float * cb,const float * cr,float * r,float * g,float * b,int repeat)140 void avx_ict_backward(const float *y, const float *cb, const float *cr, 141 float *r, float *g, float *b, int repeat) 142 { 143 __m256 gamma_cr2g = _mm256_set1_ps(CT_CNST::GAMMA_CR2G); 144 __m256 gamma_cb2g = _mm256_set1_ps(CT_CNST::GAMMA_CB2G); 145 __m256 gamma_cr2r = _mm256_set1_ps(CT_CNST::GAMMA_CR2R); 146 __m256 gamma_cb2b = _mm256_set1_ps(CT_CNST::GAMMA_CB2B); 147 for (int i = (repeat + 7) >> 3; i > 0; --i) 148 { 149 __m256 my = _mm256_load_ps(y); 150 __m256 mcr = _mm256_load_ps(cr); 151 __m256 mcb = _mm256_load_ps(cb); 152 __m256 mg = _mm256_sub_ps(my, _mm256_mul_ps(gamma_cr2g, mcr)); 153 _mm256_store_ps(g, _mm256_sub_ps(mg, _mm256_mul_ps(gamma_cb2g, mcb))); 154 _mm256_store_ps(r, _mm256_add_ps(my, _mm256_mul_ps(gamma_cr2r, mcr))); 155 _mm256_store_ps(b, _mm256_add_ps(my, _mm256_mul_ps(gamma_cb2b, mcb))); 156 157 y += 8; cb += 8; cr += 8; 158 r += 8; g += 8; b += 8; 159 } 160 } 161 162 } 163 } 164