/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 28 Nov 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_PMATH_OP_KX_H_
#define DSP_ARCH_X86_AVX_PMATH_OP_KX_H_

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

namespace avx
{
    /*
     * Operand-order selectors for OP_K4_CORE. Each SEL(a, b) macro picks one of
     * two alternatives when the core macro is expanded:
     *   OP_DSEL - "direct" order:  result = k OP src[i]  (k in register, src as a
     *             memory operand; also used for commutative add/mul)
     *   OP_RSEL - "reverse" order: result = src[i] OP k  (src is first loaded into
     *             a register; needed because vsub/vdiv are not commutative)
     */
    #define OP_DSEL(a, b)   a
    #define OP_RSEL(a, b)   b

    /*
     * Core loop: applies a binary operation between the scalar constant k and each
     * element of SRC, writing results to DST (SRC may be the same operand as DST
     * for the in-place *_k2 variants).
     *
     *   DST, SRC - names of the asm operands holding destination/source pointers
     *   OP       - AVX mnemonic prefix ("vadd", "vsub", "vmul", "vdiv"); the macro
     *              appends the "ps"/"ss" suffix itself
     *   SEL      - OP_DSEL or OP_RSEL, see above
     *
     * k is broadcast into ymm0 and copied to ymm1 so that the two unrolled
     * instruction streams do not serialize on a single register. The element loop
     * is processed in 32/16/8/4-element blocks, then a scalar 1-element tail.
     * %[off] is the running byte offset, %[count] the remaining element count.
     */
    #define OP_K4_CORE(DST, SRC, OP, SEL) \
        __ASM_EMIT("xor %[off], %[off]") \
        __ASM_EMIT("vbroadcastss %[k], %%ymm0") \
        __ASM_EMIT("sub $32, %[count]") \
        __ASM_EMIT("vmovaps %%ymm0, %%ymm1") \
        __ASM_EMIT("jb 2f") \
        /* 32x blocks */ \
        __ASM_EMIT("1:") \
        __ASM_EMIT(SEL("", "vmovups 0x000(%[" SRC "], %[off]), %%ymm4")) \
        __ASM_EMIT(SEL("", "vmovups 0x020(%[" SRC "], %[off]), %%ymm5")) \
        __ASM_EMIT(SEL("", "vmovups 0x040(%[" SRC "], %[off]), %%ymm6")) \
        __ASM_EMIT(SEL("", "vmovups 0x060(%[" SRC "], %[off]), %%ymm7")) \
        __ASM_EMIT(OP "ps " SEL("0x000(%[" SRC "], %[off])", "%%ymm0") ", " SEL("%%ymm0", "%%ymm4") ", %%ymm4") \
        __ASM_EMIT(OP "ps " SEL("0x020(%[" SRC "], %[off])", "%%ymm1") ", " SEL("%%ymm1", "%%ymm5") ", %%ymm5") \
        __ASM_EMIT(OP "ps " SEL("0x040(%[" SRC "], %[off])", "%%ymm0") ", " SEL("%%ymm0", "%%ymm6") ", %%ymm6") \
        __ASM_EMIT(OP "ps " SEL("0x060(%[" SRC "], %[off])", "%%ymm1") ", " SEL("%%ymm1", "%%ymm7") ", %%ymm7") \
        __ASM_EMIT("vmovups %%ymm4, 0x000(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm5, 0x020(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm6, 0x040(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm7, 0x060(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x80, %[off]") \
        __ASM_EMIT("sub $32, %[count]") \
        __ASM_EMIT("jae 1b") \
        /* 16x block */ \
        __ASM_EMIT("2:") \
        __ASM_EMIT("add $16, %[count]") /* 32 - 16 */ \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT(SEL("", "vmovups 0x000(%[" SRC "], %[off]), %%ymm4")) \
        __ASM_EMIT(SEL("", "vmovups 0x020(%[" SRC "], %[off]), %%ymm5")) \
        __ASM_EMIT(OP "ps " SEL("0x000(%[" SRC "], %[off])", "%%ymm0") ", " SEL("%%ymm0", "%%ymm4") ", %%ymm4") \
        __ASM_EMIT(OP "ps " SEL("0x020(%[" SRC "], %[off])", "%%ymm1") ", " SEL("%%ymm1", "%%ymm5") ", %%ymm5") \
        __ASM_EMIT("vmovups %%ymm4, 0x000(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm5, 0x020(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        /* 8x block */ \
        __ASM_EMIT("4:") \
        __ASM_EMIT("add $8, %[count]") /* 16 - 8 */ \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT(SEL("", "vmovups 0x000(%[" SRC "], %[off]), %%ymm4")) \
        __ASM_EMIT(OP "ps " SEL("0x000(%[" SRC "], %[off])", "%%ymm0") ", " SEL("%%ymm0", "%%ymm4") ", %%ymm4") \
        __ASM_EMIT("vmovups %%ymm4, 0x000(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x20, %[off]") \
        __ASM_EMIT("sub $8, %[count]") \
        \
        /* 4x block */ \
        __ASM_EMIT("6:") \
        __ASM_EMIT("add $4, %[count]") /* 8 - 4 */ \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT(SEL("", "vmovups 0x000(%[" SRC "], %[off]), %%xmm4")) \
        __ASM_EMIT(OP "ps " SEL("0x000(%[" SRC "], %[off])", "%%xmm0") ", " SEL("%%xmm0", "%%xmm4") ", %%xmm4") \
        __ASM_EMIT("vmovups %%xmm4, 0x000(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x10, %[off]") \
        __ASM_EMIT("sub $4, %[count]") \
        \
        /* 1x blocks */ \
        __ASM_EMIT("8:") \
        __ASM_EMIT("add $3, %[count]") /* 4 - 1 */ \
        __ASM_EMIT("jl 10f") \
        __ASM_EMIT("9:") \
        __ASM_EMIT(SEL("", "vmovss 0x000(%[" SRC "], %[off]), %%xmm4")) \
        __ASM_EMIT(OP "ss " SEL("0x000(%[" SRC "], %[off])", "%%xmm0") ", " SEL("%%xmm0", "%%xmm4") ", %%xmm4") \
        __ASM_EMIT("vmovss %%xmm4, 0x000(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 9b") \
        __ASM_EMIT("10:")

    /* dst[i] = dst[i] + k */
    void add_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "dst", "vadd", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = dst[i] - k (reverse selector: memory operand is the minuend) */
    void sub_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "dst", "vsub", OP_RSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = k - dst[i] */
    void rsub_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "dst", "vsub", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = dst[i] * k */
    void mul_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "dst", "vmul", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = dst[i] / k, implemented as multiplication by the reciprocal
       (one scalar division instead of count vector divisions) */
    void div_k2(float *dst, float k, size_t count)
    {
        mul_k2(dst, 1.0f/k, count);
    }

    /* dst[i] = k / dst[i] */
    void rdiv_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "dst", "vdiv", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = src[i] + k */
    void add_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "src", "vadd", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = src[i] - k */
    void sub_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "src", "vsub", OP_RSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = k - src[i] */
    void rsub_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "src", "vsub", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = src[i] * k */
    void mul_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "src", "vmul", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    /* dst[i] = src[i] / k, implemented as multiplication by the reciprocal */
    void div_k3(float *dst, const float *src, float k, size_t count)
    {
        mul_k3(dst, src, 1.0f/k, count);
    }

    /* dst[i] = k / src[i] */
    void rdiv_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            OP_K4_CORE("dst", "src", "vdiv", OP_DSEL)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef OP_K4_CORE
    #undef OP_DSEL
    #undef OP_RSEL

    /*
     * FMA selectors: FMA_SEL(a, b) expands to the non-FMA sequence (a) for
     * plain-AVX builds or the FMA3 sequence (b) for *_fma3 variants.
     */
    #define FMA_OFF(a, b)   a
    #define FMA_ON(a, b)    b

    /*
     * Core loop: dst[i] = fmod(src[i], k), computed as
     *   r = x - trunc(x/k) * k
     * vcvttps2dq/vcvtdq2ps performs truncation toward zero, matching C fmod
     * semantics for the sign of the result. With FMA enabled, the multiply and
     * subtract fuse into a single vfnmadd231 (r = r - trunc*k).
     * NOTE(review): the int32 round-trip means results are unreliable when
     * |x/k| >= 2^31 — presumably acceptable for audio-range data; confirm.
     * Processes 16/8/4-element blocks, then a 1-element tail.
     */
    #define FMOD_KX_CORE(DST, SRC, FMA_SEL) \
        __ASM_EMIT("vbroadcastss %[k], %%ymm0") \
        __ASM_EMIT("xor %[off], %[off]") \
        __ASM_EMIT("vmovaps %%ymm0, %%ymm1") \
        /* x16 blocks */ \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jb 2f") \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%ymm2") \
        __ASM_EMIT("vmovups 0x20(%[" SRC "], %[off]), %%ymm3") \
        __ASM_EMIT("vdivps %%ymm0, %%ymm2, %%ymm4") \
        __ASM_EMIT("vdivps %%ymm1, %%ymm3, %%ymm5") \
        __ASM_EMIT("vcvttps2dq %%ymm4, %%ymm4") \
        __ASM_EMIT("vcvttps2dq %%ymm5, %%ymm5") \
        __ASM_EMIT("vcvtdq2ps %%ymm4, %%ymm4") \
        __ASM_EMIT("vcvtdq2ps %%ymm5, %%ymm5") \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm0, %%ymm4, %%ymm4", "")) \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm1, %%ymm5, %%ymm5", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm4, %%ymm2, %%ymm2", "vfnmadd231ps %%ymm0, %%ymm4, %%ymm2")) \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm5, %%ymm3, %%ymm3", "vfnmadd231ps %%ymm1, %%ymm5, %%ymm3")) \
        __ASM_EMIT("vmovups %%ymm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm3, 0x20(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jae 1b") \
        __ASM_EMIT("2:") \
        /* x8 block */ \
        __ASM_EMIT("add $8, %[count]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vmovups 0x10(%[" SRC "], %[off]), %%xmm3") \
        __ASM_EMIT("vdivps %%xmm0, %%xmm2, %%xmm4") \
        __ASM_EMIT("vdivps %%xmm1, %%xmm3, %%xmm5") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm5, %%xmm5") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm5, %%xmm5") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm0, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm1, %%xmm5, %%xmm5", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm2, %%xmm2", "vfnmadd231ps %%xmm0, %%xmm4, %%xmm2")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm5, %%xmm3, %%xmm3", "vfnmadd231ps %%xmm1, %%xmm5, %%xmm3")) \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%xmm3, 0x10(%[" DST "], %[off])") \
        __ASM_EMIT("sub $8, %[count]") \
        __ASM_EMIT("add $0x20, %[off]") \
        __ASM_EMIT("4:") \
        /* x4 block */ \
        __ASM_EMIT("add $4, %[count]") \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vdivps %%xmm0, %%xmm2, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm0, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm2, %%xmm2", "vfnmadd231ps %%xmm0, %%xmm4, %%xmm2")) \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("sub $4, %[count]") \
        __ASM_EMIT("add $0x10, %[off]") \
        __ASM_EMIT("6:") \
        /* x1 blocks */ \
        __ASM_EMIT("add $3, %[count]") \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT("7:") \
        __ASM_EMIT("vmovss 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vdivps %%xmm0, %%xmm2, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm0, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm2, %%xmm2", "vfnmadd231ps %%xmm0, %%xmm4, %%xmm2")) \
        __ASM_EMIT("vmovss %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 7b") \
        __ASM_EMIT("8:")

    /* dst[i] = fmod(dst[i], k), plain AVX */
    void mod_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FMOD_KX_CORE("dst", "dst", FMA_OFF)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(dst[i], k), FMA3 variant */
    void mod_k2_fma3(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FMOD_KX_CORE("dst", "dst", FMA_ON)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(src[i], k), plain AVX */
    void mod_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FMOD_KX_CORE("dst", "src", FMA_OFF)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(src[i], k), FMA3 variant */
    void mod_k3_fma3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FMOD_KX_CORE("dst", "src", FMA_ON)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    #undef FMOD_KX_CORE

    /*
     * Core loop: dst[i] = fmod(k, src[i]) — the "reverse" modulo, computed as
     *   r = k - trunc(k/x) * x
     * Same truncation strategy (and the same int32-range caveat) as
     * FMOD_KX_CORE; the FMA path uses vfnmadd132 (r = k - trunc*x).
     */
    #define FRMOD_KX_CORE(DST, SRC, FMA_SEL) \
        __ASM_EMIT("vbroadcastss %[k], %%ymm0") \
        __ASM_EMIT("xor %[off], %[off]") \
        __ASM_EMIT("vmovaps %%ymm0, %%ymm1") \
        /* x16 blocks */ \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jb 2f") \
        __ASM_EMIT("1:") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%ymm2") \
        __ASM_EMIT("vmovups 0x20(%[" SRC "], %[off]), %%ymm3") \
        __ASM_EMIT("vdivps %%ymm2, %%ymm0, %%ymm4") \
        __ASM_EMIT("vdivps %%ymm3, %%ymm1, %%ymm5") \
        __ASM_EMIT("vcvttps2dq %%ymm4, %%ymm4") \
        __ASM_EMIT("vcvttps2dq %%ymm5, %%ymm5") \
        __ASM_EMIT("vcvtdq2ps %%ymm4, %%ymm4") \
        __ASM_EMIT("vcvtdq2ps %%ymm5, %%ymm5") \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm2, %%ymm4, %%ymm4", "")) \
        __ASM_EMIT(FMA_SEL("vmulps %%ymm3, %%ymm5, %%ymm5", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm4, %%ymm0, %%ymm2", "vfnmadd132ps %%ymm4, %%ymm0, %%ymm2")) \
        __ASM_EMIT(FMA_SEL("vsubps %%ymm5, %%ymm1, %%ymm3", "vfnmadd132ps %%ymm5, %%ymm1, %%ymm3")) \
        __ASM_EMIT("vmovups %%ymm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%ymm3, 0x20(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x40, %[off]") \
        __ASM_EMIT("sub $16, %[count]") \
        __ASM_EMIT("jae 1b") \
        __ASM_EMIT("2:") \
        /* x8 block */ \
        __ASM_EMIT("add $8, %[count]") \
        __ASM_EMIT("jl 4f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vmovups 0x10(%[" SRC "], %[off]), %%xmm3") \
        __ASM_EMIT("vdivps %%xmm2, %%xmm0, %%xmm4") \
        __ASM_EMIT("vdivps %%xmm3, %%xmm1, %%xmm5") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm5, %%xmm5") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm5, %%xmm5") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm2, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm3, %%xmm5, %%xmm5", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm0, %%xmm2", "vfnmadd132ps %%xmm4, %%xmm0, %%xmm2")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm5, %%xmm1, %%xmm3", "vfnmadd132ps %%xmm5, %%xmm1, %%xmm3")) \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("vmovups %%xmm3, 0x10(%[" DST "], %[off])") \
        __ASM_EMIT("sub $8, %[count]") \
        __ASM_EMIT("add $0x20, %[off]") \
        __ASM_EMIT("4:") \
        /* x4 block */ \
        __ASM_EMIT("add $4, %[count]") \
        __ASM_EMIT("jl 6f") \
        __ASM_EMIT("vmovups 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vdivps %%xmm2, %%xmm0, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm2, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm0, %%xmm2", "vfnmadd132ps %%xmm4, %%xmm0, %%xmm2")) \
        __ASM_EMIT("vmovups %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("sub $4, %[count]") \
        __ASM_EMIT("add $0x10, %[off]") \
        __ASM_EMIT("6:") \
        /* x1 blocks */ \
        __ASM_EMIT("add $3, %[count]") \
        __ASM_EMIT("jl 8f") \
        __ASM_EMIT("7:") \
        __ASM_EMIT("vmovss 0x00(%[" SRC "], %[off]), %%xmm2") \
        __ASM_EMIT("vdivps %%xmm2, %%xmm0, %%xmm4") \
        __ASM_EMIT("vcvttps2dq %%xmm4, %%xmm4") \
        __ASM_EMIT("vcvtdq2ps %%xmm4, %%xmm4") \
        __ASM_EMIT(FMA_SEL("vmulps %%xmm2, %%xmm4, %%xmm4", "")) \
        __ASM_EMIT(FMA_SEL("vsubps %%xmm4, %%xmm0, %%xmm2", "vfnmadd132ps %%xmm4, %%xmm0, %%xmm2")) \
        __ASM_EMIT("vmovss %%xmm2, 0x00(%[" DST "], %[off])") \
        __ASM_EMIT("add $0x04, %[off]") \
        __ASM_EMIT("dec %[count]") \
        __ASM_EMIT("jge 7b") \
        __ASM_EMIT("8:")

    /* dst[i] = fmod(k, dst[i]), plain AVX */
    void rmod_k2(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FRMOD_KX_CORE("dst", "dst", FMA_OFF)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(k, dst[i]), FMA3 variant */
    void rmod_k2_fma3(float *dst, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FRMOD_KX_CORE("dst", "dst", FMA_ON)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(k, src[i]), plain AVX */
    void rmod_k3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FRMOD_KX_CORE("dst", "src", FMA_OFF)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    /* dst[i] = fmod(k, src[i]), FMA3 variant */
    void rmod_k3_fma3(float *dst, const float *src, float k, size_t count)
    {
        IF_ARCH_X86(size_t off);
        ARCH_X86_ASM
        (
            FRMOD_KX_CORE("dst", "src", FMA_ON)
            : [count] "+r" (count), [off] "=&r" (off)
            : [dst] "r" (dst), [src] "r" (src),
              [k] "o" (k)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5"
        );
    }

    #undef FRMOD_KX_CORE

    #undef FMA_OFF
    #undef FMA_ON
}

#endif /* DSP_ARCH_X86_AVX_PMATH_OP_KX_H_ */