//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
33 // File: ojph_colour_sse2.cpp 34 // Author: Aous Naman 35 // Date: 11 October 2019 36 //***************************************************************************/ 37 38 #include <cmath> 39 40 #include "ojph_defs.h" 41 #include "ojph_arch.h" 42 #include "ojph_colour.h" 43 44 #ifdef OJPH_COMPILER_MSVC 45 #include <intrin.h> 46 #else 47 #include <x86intrin.h> 48 #endif 49 50 namespace ojph { 51 namespace local { 52 53 ////////////////////////////////////////////////////////////////////////// sse2_cnvrt_float_to_si32_shftd(const float * sp,si32 * dp,float mul,int width)54 void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, 55 int width) 56 { 57 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); 58 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 59 __m128 shift = _mm_set1_ps(0.5f); 60 __m128 m = _mm_set1_ps(mul); 61 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) 62 { 63 __m128 t = _mm_loadu_ps(sp); 64 __m128 s = _mm_add_ps(t, shift); 65 s = _mm_mul_ps(s, m); 66 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); 67 } 68 _MM_SET_ROUNDING_MODE(rounding_mode); 69 } 70 71 ////////////////////////////////////////////////////////////////////////// sse2_cnvrt_float_to_si32(const float * sp,si32 * dp,float mul,int width)72 void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, 73 int width) 74 { 75 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); 76 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 77 __m128 m = _mm_set1_ps(mul); 78 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) 79 { 80 __m128 t = _mm_loadu_ps(sp); 81 __m128 s = _mm_mul_ps(t, m); 82 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); 83 } 84 _MM_SET_ROUNDING_MODE(rounding_mode); 85 } 86 87 88 ////////////////////////////////////////////////////////////////////////// sse2_cnvrt_si32_to_si32_shftd(const si32 * sp,si32 * dp,int shift,int width)89 void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, 90 int width) 91 { 92 __m128i sh = 
_mm_set1_epi32(shift); 93 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) 94 { 95 __m128i s = _mm_loadu_si128((__m128i*)sp); 96 s = _mm_add_epi32(s, sh); 97 _mm_storeu_si128((__m128i*)dp, s); 98 } 99 } 100 101 ////////////////////////////////////////////////////////////////////////// sse2_rct_forward(const si32 * r,const si32 * g,const si32 * b,si32 * y,si32 * cb,si32 * cr,int repeat)102 void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, 103 si32 *y, si32 *cb, si32 *cr, int repeat) 104 { 105 for (int i = (repeat + 3) >> 2; i > 0; --i) 106 { 107 __m128i mr = _mm_load_si128((__m128i*)r); 108 __m128i mg = _mm_load_si128((__m128i*)g); 109 __m128i mb = _mm_load_si128((__m128i*)b); 110 __m128i t = _mm_add_epi32(mr, mb); 111 t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); 112 _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); 113 t = _mm_sub_epi32(mb, mg); 114 _mm_store_si128((__m128i*)cb, t); 115 t = _mm_sub_epi32(mr, mg); 116 _mm_store_si128((__m128i*)cr, t); 117 118 r += 4; g += 4; b += 4; 119 y += 4; cb += 4; cr += 4; 120 } 121 } 122 123 ////////////////////////////////////////////////////////////////////////// sse2_rct_backward(const si32 * y,const si32 * cb,const si32 * cr,si32 * r,si32 * g,si32 * b,int repeat)124 void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, 125 si32 *r, si32 *g, si32 *b, int repeat) 126 { 127 for (int i = (repeat + 3) >> 2; i > 0; --i) 128 { 129 __m128i my = _mm_load_si128((__m128i*)y); 130 __m128i mcb = _mm_load_si128((__m128i*)cb); 131 __m128i mcr = _mm_load_si128((__m128i*)cr); 132 133 __m128i t = _mm_add_epi32(mcb, mcr); 134 t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); 135 _mm_store_si128((__m128i*)g, t); 136 __m128i u = _mm_add_epi32(mcb, t); 137 _mm_store_si128((__m128i*)b, u); 138 u = _mm_add_epi32(mcr, t); 139 _mm_store_si128((__m128i*)r, u); 140 141 y += 4; cb += 4; cr += 4; 142 r += 4; g += 4; b += 4; 143 } 144 } 145 146 } 147 } 148