/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 26 Jan 2020
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX2_FLOAT_H_
#define DSP_ARCH_X86_AVX2_FLOAT_H_

#ifndef DSP_ARCH_X86_AVX2_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX2_IMPL */

namespace avx2
{
    #define U8VEC(x)    x, x, x, x, x, x, x, x
    IF_ARCH_X86(
        static uint32_t XC_SATURATE[] __lsp_aligned32 =
        {
            U8VEC(0x7fffffff),          // X_ABS
            U8VEC(0x80000000),          // X_SIGN
            U8VEC(0x7f800000),          // X_P_INF
            U8VEC(FLOAT_SAT_P_NAN_I),   // SX_P_NAN
            U8VEC(FLOAT_SAT_P_INF_I)    // SX_P_INF
        };
    )
    #undef U8VEC
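
    /*
     * Hedged reference sketch (comment only, not compiled): SAT_BODY below rewrites
     * IEEE-754 special values while preserving the sign bit: NaNs (abs(s) compares
     * greater than +Inf as an integer) are replaced with FLOAT_SAT_P_NAN_I, and
     * infinities with FLOAT_SAT_P_INF_I. Assuming <cstdint> and <cstring> are
     * available, a scalar equivalent for one element (saturate_one is a hypothetical
     * name) could look roughly like this:
     *
     *     static inline float saturate_one(float s)
     *     {
     *         uint32_t bits;
     *         memcpy(&bits, &s, sizeof(bits));            // reinterpret float as bits
     *         uint32_t sign = bits & 0x80000000u;         // X_SIGN
     *         uint32_t mag  = bits & 0x7fffffffu;         // X_ABS
     *         if (mag > 0x7f800000u)                      // abs(s) > +Inf  => NaN
     *             bits = sign | FLOAT_SAT_P_NAN_I;
     *         else if (mag == 0x7f800000u)                // abs(s) == +Inf
     *             bits = sign | FLOAT_SAT_P_INF_I;
     *         memcpy(&s, &bits, sizeof(s));
     *         return s;
     *     }
     */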

    #define SAT_BODY(DST, SRC) \
        __ASM_EMIT("xor %[off], %[off]") \
        /* x16 blocks */ \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jb 2f") \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%ymm0")          /* ymm0 = s */ \
        __ASM_EMIT("vmovups 0x20(%[" SRC "], %[off]), %%ymm1") \
        __ASM_EMIT("vandps 0x00 + %[XC], %%ymm0, %%ymm2")               /* ymm2 = abs(s) */ \
        __ASM_EMIT("vandps 0x00 + %[XC], %%ymm1, %%ymm3") \
        __ASM_EMIT("vandps 0x20 + %[XC], %%ymm0, %%ymm4")               /* ymm4 = sign(s) */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%ymm1, %%ymm5") \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%ymm2, %%ymm6")             /* ymm6 = abs(s) > +Inf */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%ymm3, %%ymm7") \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%ymm2, %%ymm2")             /* ymm2 = abs(s) == +Inf */ \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%ymm3, %%ymm3") \
        __ASM_EMIT("vblendvps %%ymm6, 0x60 + %[XC], %%ymm0, %%ymm0")    /* ymm0 = S = s & (abs(s) <= +Inf) | PNAN & (abs(s) > +Inf) */ \
        __ASM_EMIT("vblendvps %%ymm7, 0x60 + %[XC], %%ymm1, %%ymm1") \
        __ASM_EMIT("vblendvps %%ymm2, 0x80 + %[XC], %%ymm0, %%ymm0")    /* ymm0 = S & (abs(S) != +Inf) | PINF & (abs(s) == +Inf) */ \
        __ASM_EMIT("vblendvps %%ymm3, 0x80 + %[XC], %%ymm1, %%ymm1") \
        __ASM_EMIT("vorps %%ymm4, %%ymm0, %%ymm0") \
        __ASM_EMIT("vorps %%ymm5, %%ymm1, %%ymm1") \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm1, 0x20(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jae 1b") \
        __ASM_EMIT("2:") \
        /* x8 blocks */ \
        __ASM_EMIT("add $8, %[count]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vmovups 0x10(%[" SRC "], %[off]), %%xmm1") \
        __ASM_EMIT("vandps 0x00 + %[XC], %%xmm0, %%xmm2")               /* xmm2 = abs(s) */ \
        __ASM_EMIT("vandps 0x00 + %[XC], %%xmm1, %%xmm3") \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm1, %%xmm5") \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%xmm2, %%xmm6")             /* xmm6 = abs(s) > +Inf */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%xmm3, %%xmm7") \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%xmm2, %%xmm2")             /* xmm2 = abs(s) == +Inf */ \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%xmm3, %%xmm3") \
        __ASM_EMIT("vblendvps %%xmm6, 0x60 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S = s & (abs(s) <= +Inf) | PNAN & (abs(s) > +Inf) */ \
        __ASM_EMIT("vblendvps %%xmm7, 0x60 + %[XC], %%xmm1, %%xmm1") \
        __ASM_EMIT("vblendvps %%xmm2, 0x80 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S & (abs(S) != +Inf) | PINF & (abs(s) == +Inf) */ \
        __ASM_EMIT("vblendvps %%xmm3, 0x80 + %[XC], %%xmm1, %%xmm1") \
        __ASM_EMIT("vorps %%xmm4, %%xmm0, %%xmm0") \
        __ASM_EMIT("vorps %%xmm5, %%xmm1, %%xmm1") \
        __ASM_EMIT("vmovups %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%xmm1, 0x10(%[" DST "], %[off])") \
        __ASM_EMIT("sub $8, %[count]") \
        __ASM_EMIT("add $0x20, %[off]") \
        __ASM_EMIT("4:") \
        /* x4 block */ \
        __ASM_EMIT("add $4, %[count]") \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vandps 0x00 + %[XC], %%xmm0, %%xmm2")               /* xmm2 = abs(s) */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%xmm2, %%xmm6")             /* xmm6 = abs(s) > +Inf */ \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%xmm2, %%xmm2")             /* xmm2 = abs(s) == +Inf */ \
        __ASM_EMIT("vblendvps %%xmm6, 0x60 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S = s & (abs(s) <= +Inf) | PNAN & (abs(s) > +Inf) */ \
        __ASM_EMIT("vblendvps %%xmm2, 0x80 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S & (abs(S) != +Inf) | PINF & (abs(s) == +Inf) */ \
        __ASM_EMIT("vorps %%xmm4, %%xmm0, %%xmm0") \
        __ASM_EMIT("vmovups %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("sub $4, %[count]") \
        __ASM_EMIT("add $0x10, %[off]") \
        __ASM_EMIT("6:") \
        /* x1 block */ \
        __ASM_EMIT("add $3, %[count]") \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT("7:") \
        __ASM_EMIT("vmovss 0x00(%[" SRC "], %[off]), %%xmm0")           /* xmm0 = s */ \
        __ASM_EMIT("vandps 0x00 + %[XC], %%xmm0, %%xmm2")               /* xmm2 = abs(s) */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[XC], %%xmm2, %%xmm6")             /* xmm6 = abs(s) > +Inf */ \
        __ASM_EMIT("vpcmpeqd 0x40 + %[XC], %%xmm2, %%xmm2")             /* xmm2 = abs(s) == +Inf */ \
        __ASM_EMIT("vblendvps %%xmm6, 0x60 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S = s & (abs(s) <= +Inf) | PNAN & (abs(s) > +Inf) */ \
        __ASM_EMIT("vblendvps %%xmm2, 0x80 + %[XC], %%xmm0, %%xmm0")    /* xmm0 = S & (abs(S) != +Inf) | PINF & (abs(s) == +Inf) */ \
        __ASM_EMIT("vorps %%xmm4, %%xmm0, %%xmm0") \
        __ASM_EMIT("vmovss %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 7b") \
        __ASM_EMIT("8:")

    void saturate(float *dst, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM(
            SAT_BODY("dst", "dst")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst),
              [XC] "o" (XC_SATURATE)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    void copy_saturated(float *dst, const float *src, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM(
            SAT_BODY("dst", "src")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst), [src] "r" (src),
              [XC] "o" (XC_SATURATE)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
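
    /*
     * Hedged reference sketch (comment only, not compiled): LIMIT_SAT_BODY below
     * clamps every sample into the [-1, +1] range, while NaN inputs are flushed to
     * +0.0f: the (abs(s) <= +1) comparison is false for NaN, and the +/-1 fallback
     * is masked out when abs(s) > +Inf. Assuming <cmath> is available, a scalar
     * equivalent for one element (limit_one is a hypothetical name) could look
     * roughly like this:
     *
     *     static inline float limit_one(float s)
     *     {
     *         if (fabsf(s) <= 1.0f)                   // in range (false for NaN)
     *             return s;
     *         if (s != s)                             // NaN is flushed to +0
     *             return 0.0f;
     *         return (s < 0.0f) ? -1.0f : 1.0f;       // clamp to +/-1, keeping the sign
     *     }
     */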

    #define LIMIT_SAT_BODY(DST, SRC) \
        __ASM_EMIT("xor %[off], %[off]") \
        /* x16 blocks */ \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jb 2f") \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%ymm0")          /* ymm0 = s */ \
        __ASM_EMIT("vmovups 0x20(%[" SRC "], %[off]), %%ymm1") \
        __ASM_EMIT("vmovaps 0x00 + %[XC], %%ymm2")                      /* ymm2 = +1 */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%ymm0, %%ymm6")               /* ymm6 = abs(s) */ \
        __ASM_EMIT("vmovaps %%ymm2, %%ymm3") \
        __ASM_EMIT("vandps 0x20 + %[XC], %%ymm1, %%ymm7") \
        __ASM_EMIT("vcmpps $2, %%ymm2, %%ymm6, %%ymm2")                 /* ymm2 = c = [ (abs(s) <= +1) & !isnan(s) ] */ \
        __ASM_EMIT("vcmpps $2, %%ymm3, %%ymm7, %%ymm3") \
        __ASM_EMIT("vandps 0x40 + %[XC], %%ymm0, %%ymm4")               /* ymm4 = sign(s) */ \
        __ASM_EMIT("vandps 0x40 + %[XC], %%ymm1, %%ymm5") \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%ymm6, %%ymm6")             /* ymm6 = [ abs(s) > +Inf ] */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%ymm7, %%ymm7") \
        __ASM_EMIT("vorps 0x00 + %[XC], %%ymm4, %%ymm4")                /* ymm4 = +1 * sign(s) */ \
        __ASM_EMIT("vorps 0x00 + %[XC], %%ymm5, %%ymm5") \
        __ASM_EMIT("vandnps %%ymm4, %%ymm6, %%ymm4")                    /* ymm4 = r = +1 * sign(s) & [ abs(s) <= +Inf ] */ \
        __ASM_EMIT("vandnps %%ymm5, %%ymm7, %%ymm5") \
        __ASM_EMIT("vblendvps %%ymm2, %%ymm0, %%ymm4, %%ymm0")          /* ymm0 = (s & c) | (r & !c) */ \
        __ASM_EMIT("vblendvps %%ymm3, %%ymm1, %%ymm5, %%ymm1") \
        __ASM_EMIT("vmovups %%ymm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm1, 0x20(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jae 1b") \
        __ASM_EMIT("2:") \
        /* x8 blocks */ \
        __ASM_EMIT("add $8, %[count]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vmovups 0x10(%[" SRC "], %[off]), %%xmm1") \
        __ASM_EMIT("vmovaps 0x00 + %[XC], %%xmm2")                      /* xmm2 = +1 */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm6")               /* xmm6 = abs(s) */ \
        __ASM_EMIT("vmovaps %%xmm2, %%xmm3") \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm1, %%xmm7") \
        __ASM_EMIT("vcmpps $2, %%xmm2, %%xmm6, %%xmm2")                 /* xmm2 = c = [ (abs(s) <= +1) & !isnan(s) ] */ \
        __ASM_EMIT("vcmpps $2, %%xmm3, %%xmm7, %%xmm3") \
        __ASM_EMIT("vandps 0x40 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vandps 0x40 + %[XC], %%xmm1, %%xmm5") \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%xmm6, %%xmm6")             /* xmm6 = [ abs(s) > +Inf ] */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%xmm7, %%xmm7") \
        __ASM_EMIT("vorps 0x00 + %[XC], %%xmm4, %%xmm4")                /* xmm4 = +1 * sign(s) */ \
        __ASM_EMIT("vorps 0x00 + %[XC], %%xmm5, %%xmm5") \
        __ASM_EMIT("vandnps %%xmm4, %%xmm6, %%xmm4")                    /* xmm4 = r = +1 * sign(s) & [ abs(s) <= +Inf ] */ \
        __ASM_EMIT("vandnps %%xmm5, %%xmm7, %%xmm5") \
        __ASM_EMIT("vblendvps %%xmm2, %%xmm0, %%xmm4, %%xmm0")          /* xmm0 = (s & c) | (r & !c) */ \
        __ASM_EMIT("vblendvps %%xmm3, %%xmm1, %%xmm5, %%xmm1") \
        __ASM_EMIT("vmovups %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%xmm1, 0x10(%[" DST "], %[off])") \
        __ASM_EMIT("sub $8, %[count]") \
        __ASM_EMIT("add $0x20, %[off]") \
        __ASM_EMIT("4:") \
        /* x4 block */ \
        __ASM_EMIT("add $4, %[count]") \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vmovaps 0x00 + %[XC], %%xmm2")                      /* xmm2 = +1 */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm6")               /* xmm6 = abs(s) */ \
        __ASM_EMIT("vcmpps $2, %%xmm2, %%xmm6, %%xmm2")                 /* xmm2 = c = [ (abs(s) <= +1) & !isnan(s) ] */ \
        __ASM_EMIT("vandps 0x40 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%xmm6, %%xmm6")             /* xmm6 = [ abs(s) > +Inf ] */ \
        __ASM_EMIT("vorps 0x00 + %[XC], %%xmm4, %%xmm4")                /* xmm4 = +1 * sign(s) */ \
        __ASM_EMIT("vandnps %%xmm4, %%xmm6, %%xmm4")                    /* xmm4 = r = +1 * sign(s) & [ abs(s) <= +Inf ] */ \
        __ASM_EMIT("vblendvps %%xmm2, %%xmm0, %%xmm4, %%xmm0")          /* xmm0 = (s & c) | (r & !c) */ \
        __ASM_EMIT("vmovups %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("sub $4, %[count]") \
        __ASM_EMIT("add $0x10, %[off]") \
        __ASM_EMIT("6:") \
        /* x1 block */ \
        __ASM_EMIT("add $3, %[count]") \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT("7:") \
        __ASM_EMIT("vmovss 0x00(%[" SRC "], %[off]), %%xmm0")           /* xmm0 = s */ \
        __ASM_EMIT("vmovaps 0x00 + %[XC], %%xmm2")                      /* xmm2 = +1 */ \
        __ASM_EMIT("vandps 0x20 + %[XC], %%xmm0, %%xmm6")               /* xmm6 = abs(s) */ \
        __ASM_EMIT("vcmpps $2, %%xmm2, %%xmm6, %%xmm2")                 /* xmm2 = c = [ (abs(s) <= +1) & !isnan(s) ] */ \
        __ASM_EMIT("vandps 0x40 + %[XC], %%xmm0, %%xmm4")               /* xmm4 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[XC], %%xmm6, %%xmm6")             /* xmm6 = [ abs(s) > +Inf ] */ \
        __ASM_EMIT("vorps 0x00 + %[XC], %%xmm4, %%xmm4")                /* xmm4 = +1 * sign(s) */ \
        __ASM_EMIT("vandnps %%xmm4, %%xmm6, %%xmm4")                    /* xmm4 = r = +1 * sign(s) & [ abs(s) <= +Inf ] */ \
        __ASM_EMIT("vblendvps %%xmm2, %%xmm0, %%xmm4, %%xmm0")          /* xmm0 = (s & c) | (r & !c) */ \
        __ASM_EMIT("vmovss %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 7b") \
        __ASM_EMIT("8:")

    #define U8VEC(x)    x, x, x, x, x, x, x, x
    IF_ARCH_X86(
        static uint32_t XLIM_SAT[] __lsp_aligned32 =
        {
            U8VEC(0x3f800000),          // +1
            U8VEC(0x7fffffff),          // abs
            U8VEC(0x80000000),          // sign
            U8VEC(0x7f800000)           // +Inf
        };
    )
    #undef U8VEC

    void limit_saturate1(float *dst, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM(
            LIMIT_SAT_BODY("dst", "dst")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst),
              [XC] "o" (XLIM_SAT)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    void limit_saturate2(float *dst, const float *src, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM(
            LIMIT_SAT_BODY("dst", "src")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst), [src] "r" (src),
              [XC] "o" (XLIM_SAT)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef LIMIT_SAT_BODY
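
    /*
     * Hedged reference sketch (comment only, not compiled): SANITIZE_BODY below keeps
     * only normal finite values and collapses denormals, zeros, NaNs and infinities
     * to signed zero (only the sign bit survives), which is the usual way to avoid
     * denormal processing overhead in subsequent DSP code. Assuming <cstdint> and
     * <cstring> are available, a scalar equivalent for one element (sanitize_one is
     * a hypothetical name) could look roughly like this:
     *
     *     static inline float sanitize_one(float s)
     *     {
     *         uint32_t bits;
     *         memcpy(&bits, &s, sizeof(bits));                    // reinterpret float as bits
     *         uint32_t mag = bits & 0x7fffffffu;                  // X_ABS
     *         if ((mag <= 0x007fffffu) || (mag > 0x7f7fffffu))    // not in (X_MIN, X_MAX]
     *             bits &= 0x80000000u;                            // keep only the sign bit
     *         memcpy(&s, &bits, sizeof(s));
     *         return s;
     *     }
     */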

    #define SANITIZE_BODY(DST, SRC) \
        __ASM_EMIT("xor %[off], %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jb 2f") \
        /* 16x blocks */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovdqu 0x00(%[" SRC "], %[off]), %%ymm0")          /* ymm0 = s */ \
        __ASM_EMIT("vmovdqu 0x20(%[" SRC "], %[off]), %%ymm4") \
        __ASM_EMIT("vpand 0x00 + %[CVAL], %%ymm0, %%ymm1")              /* ymm1 = abs(s) */ \
        __ASM_EMIT("vpand 0x00 + %[CVAL], %%ymm4, %%ymm5") \
        __ASM_EMIT("vpand 0x20 + %[CVAL], %%ymm0, %%ymm2")              /* ymm2 = sign(s) */ \
        __ASM_EMIT("vpand 0x20 + %[CVAL], %%ymm4, %%ymm6") \
        __ASM_EMIT("vpcmpgtd 0x40 + %[CVAL], %%ymm1, %%ymm3")           /* ymm3 = abs(s) > X_MAX */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[CVAL], %%ymm5, %%ymm7") \
        __ASM_EMIT("vpcmpgtd 0x60 + %[CVAL], %%ymm1, %%ymm1")           /* ymm1 = abs(s) > X_MIN */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[CVAL], %%ymm5, %%ymm5") \
        __ASM_EMIT("vpandn %%ymm1, %%ymm3, %%ymm1")                     /* ymm1 = (abs(s) > X_MIN) & (abs(s) <= X_MAX) */ \
        __ASM_EMIT("vpandn %%ymm5, %%ymm7, %%ymm5") \
        __ASM_EMIT("vblendvps %%ymm1, %%ymm0, %%ymm2, %%ymm0")          /* ymm0 = ((abs(s) > X_MIN) & (abs(s) <= X_MAX)) ? s : sign(s) */ \
        __ASM_EMIT("vblendvps %%ymm5, %%ymm4, %%ymm6, %%ymm4") \
        __ASM_EMIT("vmovdqu %%ymm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovdqu %%ymm4, 0x20(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jae 1b") \
        /* 8x block */ \
        __ASM_EMIT("2:") \
        __ASM_EMIT("add $8, %[count]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovdqu 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vmovdqu 0x10(%[" SRC "], %[off]), %%xmm4") \
        __ASM_EMIT("vpand 0x00 + %[CVAL], %%xmm0, %%xmm1")              /* xmm1 = abs(s) */ \
        __ASM_EMIT("vpand 0x10 + %[CVAL], %%xmm4, %%xmm5") \
        __ASM_EMIT("vpand 0x20 + %[CVAL], %%xmm0, %%xmm2")              /* xmm2 = sign(s) */ \
        __ASM_EMIT("vpand 0x30 + %[CVAL], %%xmm4, %%xmm6") \
        __ASM_EMIT("vpcmpgtd 0x40 + %[CVAL], %%xmm1, %%xmm3")           /* xmm3 = abs(s) > X_MAX */ \
        __ASM_EMIT("vpcmpgtd 0x50 + %[CVAL], %%xmm5, %%xmm7") \
        __ASM_EMIT("vpcmpgtd 0x60 + %[CVAL], %%xmm1, %%xmm1")           /* xmm1 = abs(s) > X_MIN */ \
        __ASM_EMIT("vpcmpgtd 0x70 + %[CVAL], %%xmm5, %%xmm5") \
        __ASM_EMIT("vpandn %%xmm1, %%xmm3, %%xmm1")                     /* xmm1 = (abs(s) > X_MIN) & (abs(s) <= X_MAX) */ \
        __ASM_EMIT("vpandn %%xmm5, %%xmm7, %%xmm5") \
        __ASM_EMIT("vblendvps %%xmm1, %%xmm0, %%xmm2, %%xmm0")          /* xmm0 = ((abs(s) > X_MIN) & (abs(s) <= X_MAX)) ? s : sign(s) */ \
        __ASM_EMIT("vblendvps %%xmm5, %%xmm4, %%xmm6, %%xmm4") \
        __ASM_EMIT("vmovdqu %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovdqu %%xmm4, 0x10(%[" DST "], %[off])") \
        __ASM_EMIT("sub $8, %[count]") \
        __ASM_EMIT("add $0x20, %[off]") \
        /* 4x block */ \
        __ASM_EMIT("4:") \
        __ASM_EMIT("add $4, %[count]") \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT("vmovdqu 0x00(%[" SRC "], %[off]), %%xmm0")          /* xmm0 = s */ \
        __ASM_EMIT("vpand 0x00 + %[CVAL], %%xmm0, %%xmm1")              /* xmm1 = abs(s) */ \
        __ASM_EMIT("vpand 0x20 + %[CVAL], %%xmm0, %%xmm2")              /* xmm2 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[CVAL], %%xmm1, %%xmm3")           /* xmm3 = abs(s) > X_MAX */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[CVAL], %%xmm1, %%xmm1")           /* xmm1 = abs(s) > X_MIN */ \
        __ASM_EMIT("vpandn %%xmm1, %%xmm3, %%xmm1")                     /* xmm1 = (abs(s) > X_MIN) & (abs(s) <= X_MAX) */ \
        __ASM_EMIT("vblendvps %%xmm1, %%xmm0, %%xmm2, %%xmm0")          /* xmm0 = ((abs(s) > X_MIN) & (abs(s) <= X_MAX)) ? s : sign(s) */ \
        __ASM_EMIT("vmovdqu %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("sub $4, %[count]") \
        __ASM_EMIT("add $0x10, %[off]") \
        /* 1x blocks */ \
        __ASM_EMIT("6:") \
        __ASM_EMIT("add $3, %[count]") \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT("7:") \
        __ASM_EMIT("vmovd 0x00(%[" SRC "], %[off]), %%xmm0")            /* xmm0 = s */ \
        __ASM_EMIT("vpand 0x00 + %[CVAL], %%xmm0, %%xmm1")              /* xmm1 = abs(s) */ \
        __ASM_EMIT("vpand 0x20 + %[CVAL], %%xmm0, %%xmm2")              /* xmm2 = sign(s) */ \
        __ASM_EMIT("vpcmpgtd 0x40 + %[CVAL], %%xmm1, %%xmm3")           /* xmm3 = abs(s) > X_MAX */ \
        __ASM_EMIT("vpcmpgtd 0x60 + %[CVAL], %%xmm1, %%xmm1")           /* xmm1 = abs(s) > X_MIN */ \
        __ASM_EMIT("vpandn %%xmm1, %%xmm3, %%xmm1")                     /* xmm1 = (abs(s) > X_MIN) & (abs(s) <= X_MAX) */ \
        __ASM_EMIT("vblendvps %%xmm1, %%xmm0, %%xmm2, %%xmm0")          /* xmm0 = ((abs(s) > X_MIN) & (abs(s) <= X_MAX)) ? s : sign(s) */ \
        __ASM_EMIT("vmovd %%xmm0, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 7b") \
        /* end */ \
        __ASM_EMIT("8:")

    #define U8VEC(x)    x, x, x, x, x, x, x, x
    IF_ARCH_X86(
        static uint32_t SANITIZE_CVAL[] __lsp_aligned32 =
        {
            U8VEC(0x7fffffff),          // X_ABS
            U8VEC(0x80000000),          // X_SIGN
            U8VEC(0x7f7fffff),          // X_MAX
            U8VEC(0x007fffff)           // X_MIN
        };
    )
    #undef U8VEC

    void sanitize1(float *dst, size_t count)
    {
        IF_ARCH_X86(size_t off);

        ARCH_X86_ASM
        (
            SANITIZE_BODY("dst", "dst")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst),
              [CVAL] "o" (SANITIZE_CVAL)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    void sanitize2(float *dst, const float *src, size_t count)
    {
        IF_ARCH_X86(size_t off);

        ARCH_X86_ASM
        (
            SANITIZE_BODY("dst", "src")
            : [off] "=&r" (off), [count] "+r" (count)
            : [dst] "r" (dst), [src] "r" (src),
              [CVAL] "o" (SANITIZE_CVAL)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef SANITIZE_BODY

}

#endif /* DSP_ARCH_X86_AVX2_FLOAT_H_ */