1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 13 дек. 2019 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 22 #ifndef DSP_ARCH_X86_AVX_IMPL 23 #error "This header should not be included directly" 24 #endif /* DSP_ARCH_X86_AVX_IMPL */ 25 26 #define FASTCONV_DIRECT_PREPARE_BODY(FMA_SEL) \ 27 size_t off; \ 28 \ 29 ARCH_X86_ASM( \ 30 __ASM_EMIT("lea (,%[np], 8), %[off]") \ 31 __ASM_EMIT("vmovups 0x00(%[ak]), %%ymm6") /* ymm6 = x_re */ \ 32 __ASM_EMIT("vmovups 0x20(%[ak]), %%ymm7") /* ymm7 = x_im */ \ 33 __ASM_EMIT("vmovups 0x00(%[wk]), %%ymm4") /* ymm4 = w_re */ \ 34 __ASM_EMIT("vmovups 0x20(%[wk]), %%ymm5") /* ymm5 = w_im */ \ 35 __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = a_im = 0 */ \ 36 /* x8 blocks */ \ 37 __ASM_EMIT32("subl $8, %[np]") \ 38 __ASM_EMIT64("sub $8, %[np]") \ 39 __ASM_EMIT64("jb 2f") \ 40 __ASM_EMIT("1:") \ 41 __ASM_EMIT("vmovups 0x00(%[src]), %%ymm0") /* ymm0 = a_re = re */ \ 42 __ASM_EMIT("vmulps %%ymm0, %%ymm7, %%ymm3") /* ymm3 = x_im * re */ \ 43 __ASM_EMIT("vmulps %%ymm0, %%ymm6, %%ymm2") /* ymm2 = b_re = x_re * re */ \ 44 __ASM_EMIT("vsubps %%ymm3, %%ymm1, %%ymm3") /* ymm3 = b_im = -x_im * re */ \ 45 __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") \ 46 __ASM_EMIT("vmovups %%ymm1, 0x20(%[dst])") \ 47 __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst], %[off])") \ 48 __ASM_EMIT("vmovups %%ymm3, 0x20(%[dst], %[off])") \ 49 __ASM_EMIT("add $0x20, %[src]") \ 50 __ASM_EMIT("add $0x40, %[dst]") \ 51 __ASM_EMIT32("subl $8, %[np]") \ 52 __ASM_EMIT64("sub $8, %[np]") \ 53 __ASM_EMIT("jb 2f") \ 54 /* Rotate angle */ \ 55 __ASM_EMIT("vmulps %%ymm5, %%ymm6, %%ymm2") /* ymm2 = w_im * x_re */ \ 56 __ASM_EMIT("vmulps %%ymm5, %%ymm7, %%ymm3") /* ymm3 = w_im * x_im */ \ 57 __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm6, %%ymm6", "")) /* ymm6 = w_re * x_re */ \ 58 __ASM_EMIT(FMA_SEL("vmulps %%ymm4, %%ymm7, %%ymm7", "")) /* ymm7 = w_re * x_im */ \ 59 __ASM_EMIT(FMA_SEL("vsubps %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \ 60 __ASM_EMIT(FMA_SEL("vaddps %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \ 61 __ASM_EMIT("jmp 1b") \ 62 __ASM_EMIT("2:") \ 63 : [dst] "+r" (dst), [src] "+r" (src), \ 64 [off] "=&r" (off), [np] __ASM_ARG_RW(np) \ 65 : [ak] "r" (ak), [wk] "r" (wk) \ 66 : "%xmm0", "%xmm1", "%xmm2", "%xmm3", \ 67 "%xmm4", "%xmm5", "%xmm6", "%xmm7" \ 68 ) 69 70 #define FASTCONV_REVERSE_PREPARE_BODY(FMA_SEL) \ 71 ARCH_X86_ASM( \ 72 /* 2x blocks of 4x-butterfly loop */ \ 73 __ASM_EMIT("sub $2, %[nb]") \ 74 __ASM_EMIT("jb 2f") \ 75 __ASM_EMIT("1:") \ 76 __ASM_EMIT("vmovups 0x00(%[dst]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ \ 77 __ASM_EMIT("vmovups 0x10(%[dst]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ \ 78 __ASM_EMIT("vmovups 0x20(%[dst]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ \ 79 __ASM_EMIT("vmovups 0x30(%[dst]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ \ 80 __ASM_EMIT("vinsertf128 $1, 0x40(%[dst]), %%ymm0, %%ymm0") /* ymm0 = r0 r1 r2 r3 r8 r9 r10 r11 */ \ 81 __ASM_EMIT("vinsertf128 $1, 0x50(%[dst]), %%ymm4, %%ymm4") /* ymm4 = r4 r5 r6 r7 r12 r13 r14 r15 */ \ 82 __ASM_EMIT("vinsertf128 $1, 0x60(%[dst]), %%ymm2, %%ymm2") /* ymm2 = i0 i1 i2 i3 i8 i9 i10 i11 */ \ 83 __ASM_EMIT("vinsertf128 $1, 0x70(%[dst]), %%ymm6, %%ymm6") /* ymm6 = i4 i5 i6 i7 i12 i13 i14 i15 */ \ 84 /* 1st-order 4x butterfly */ \ 85 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ \ 86 __ASM_EMIT("vhsubps %%ymm6, %%ymm2, %%ymm3") /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ \ 87 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ \ 88 __ASM_EMIT("vhaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ \ 89 /* 2nd-order 4x butterfly */ \ 90 __ASM_EMIT("vblendps $0xaa, %%ymm3, %%ymm1, %%ymm4") /* ymm4 = r1' i3' r5' i7' */ \ 91 __ASM_EMIT("vblendps $0xaa, %%ymm1, %%ymm3, %%ymm5") /* ymm5 = i1' r3' i5' r7' */ \ 92 __ASM_EMIT("vhsubps %%ymm4, %%ymm0, %%ymm1") /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ \ 93 __ASM_EMIT("vhsubps %%ymm5, %%ymm2, %%ymm3") /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ \ 94 __ASM_EMIT("vhaddps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ \ 95 __ASM_EMIT("vhaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ \ 96 __ASM_EMIT("vblendps $0xcc, %%ymm1, %%ymm0, %%ymm4") /* ymm4 = r0" i4" r1" r5" */ \ 97 __ASM_EMIT("vblendps $0xcc, %%ymm0, %%ymm1, %%ymm5") /* ymm5 = r2" r6" r3" r7" */ \ 98 __ASM_EMIT("vshufps $0x88, %%ymm3, %%ymm2, %%ymm6") /* ymm6 = i0" i1" i2" i3" */ \ 99 __ASM_EMIT("vshufps $0xdd, %%ymm3, %%ymm2, %%ymm7") /* ymm7 = i4" i5" i6" i7" */ \ 100 __ASM_EMIT("vshufps $0x88, %%ymm5, %%ymm4, %%ymm2") /* ymm2 = r0" r1" r2" r3" */ \ 101 __ASM_EMIT("vshufps $0xdd, %%ymm5, %%ymm4, %%ymm3") /* ymm3 = r4" r5" r6" r7" */ \ 102 /* 3rd-order 8x butterfly */ \ 103 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm3, %%ymm4") /* ymm4 = x_im * b_re */ \ 104 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%ymm7, %%ymm5") /* ymm5 = x_im * b_im */ \ 105 __ASM_EMIT(FMA_SEL("vmulps 0x00 + %[FFT_A], %%ymm3, %%ymm3", "")) /* ymm3 = x_re * b_re */ \ 106 __ASM_EMIT(FMA_SEL("vmulps 0x00 + %[FFT_A], %%ymm7, %%ymm7", "")) /* ymm7 = x_re * b_im */ \ 107 __ASM_EMIT(FMA_SEL("vsubps %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps 0x00 + %[FFT_A], %%ymm3, %%ymm5")) /* ymm5 = c_re = x_re * b_re - x_im * b_im */ \ 108 __ASM_EMIT(FMA_SEL("vaddps %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps 0x00 + %[FFT_A], %%ymm7, %%ymm4")) /* ymm4 = c_im = x_re * b_im + x_im * b_re */ \ 109 __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm0") /* ymm0 = a_re - c_re */ \ 110 __ASM_EMIT("vsubps %%ymm4, %%ymm6, %%ymm1") /* ymm1 = a_im - c_im */ \ 111 __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = a_re + c_re */ \ 112 __ASM_EMIT("vaddps %%ymm4, %%ymm6, %%ymm3") /* ymm3 = a_im + c_im */ \ 113 /* Store */ \ 114 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst])") \ 115 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst])") \ 116 __ASM_EMIT("vmovups %%xmm3, 0x20(%[dst])") \ 117 __ASM_EMIT("vmovups %%xmm1, 0x30(%[dst])") \ 118 __ASM_EMIT("vextractf128 $1, %%ymm2, 0x40(%[dst])") \ 119 __ASM_EMIT("vextractf128 $1, %%ymm0, 0x50(%[dst])") \ 120 __ASM_EMIT("vextractf128 $1, %%ymm3, 0x60(%[dst])") \ 121 __ASM_EMIT("vextractf128 $1, %%ymm1, 0x70(%[dst])") \ 122 __ASM_EMIT("add $0x80, %[dst]") \ 123 __ASM_EMIT("sub $2, %[nb]") \ 124 __ASM_EMIT("jae 1b") \ 125 /* 1x block of 4-butterfly */ \ 126 __ASM_EMIT("2:") \ 127 __ASM_EMIT("add $1, %[nb]") \ 128 __ASM_EMIT("jl 4f") \ 129 __ASM_EMIT("vmovups 0x00(%[dst]), %%xmm0") /* xmm0 = r0 r1 r2 r3 */ \ 130 __ASM_EMIT("vmovups 0x10(%[dst]), %%xmm4") /* xmm4 = r4 r5 r6 r7 */ \ 131 __ASM_EMIT("vmovups 0x20(%[dst]), %%xmm2") /* xmm2 = i0 i1 i2 i3 */ \ 132 __ASM_EMIT("vmovups 0x30(%[dst]), %%xmm6") /* xmm6 = i4 i5 i6 i7 */ \ 133 /* 1st-order 4x butterfly */ \ 134 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ \ 135 __ASM_EMIT("vhsubps %%xmm6, %%xmm2, %%xmm3") /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ \ 136 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ \ 137 __ASM_EMIT("vhaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ \ 138 /* 2nd-order 4x butterfly */ \ 139 __ASM_EMIT("vblendps $0xaa, %%xmm3, %%xmm1, %%xmm4") /* xmm4 = r1' i3' r5' i7' */ \ 140 __ASM_EMIT("vblendps $0xaa, %%xmm1, %%xmm3, %%xmm5") /* xmm5 = i1' r3' i5' r7' */ \ 141 __ASM_EMIT("vhsubps %%xmm4, %%xmm0, %%xmm1") /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ \ 142 __ASM_EMIT("vhsubps %%xmm5, %%xmm2, %%xmm3") /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ \ 143 __ASM_EMIT("vhaddps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ \ 144 __ASM_EMIT("vhaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ \ 145 __ASM_EMIT("vblendps $0xcc, %%xmm1, %%xmm0, %%xmm4") /* xmm4 = r0" i4" r1" r5" */ \ 146 __ASM_EMIT("vblendps $0xcc, %%xmm0, %%xmm1, %%xmm5") /* xmm5 = r2" r6" r3" r7" */ \ 147 __ASM_EMIT("vshufps $0x88, %%xmm3, %%xmm2, %%xmm6") /* xmm6 = i0" i1" i2" i3" */ \ 148 __ASM_EMIT("vshufps $0xdd, %%xmm3, %%xmm2, %%xmm7") /* xmm7 = i4" i5" i6" i7" */ \ 149 __ASM_EMIT("vshufps $0x88, %%xmm5, %%xmm4, %%xmm2") /* xmm2 = r0" r1" r2" r3" */ \ 150 __ASM_EMIT("vshufps $0xdd, %%xmm5, %%xmm4, %%xmm3") /* xmm3 = r4" r5" r6" r7" */ \ 151 /* 3rd-order 8x butterfly */ \ 152 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm3, %%xmm4") /* xmm4 = x_im * b_re */ \ 153 __ASM_EMIT("vmulps 0x20 + %[FFT_A], %%xmm7, %%xmm5") /* xmm5 = x_im * b_im */ \ 154 __ASM_EMIT(FMA_SEL("vmulps 0x00 + %[FFT_A], %%xmm3, %%xmm3", "")) /* xmm3 = x_re * b_re */ \ 155 __ASM_EMIT(FMA_SEL("vmulps 0x00 + %[FFT_A], %%xmm7, %%xmm7", "")) /* xmm7 = x_re * b_im */ \ 156 __ASM_EMIT(FMA_SEL("vsubps %%xmm5, %%xmm3, %%xmm5", "vfmsub231ps 0x00 + %[FFT_A], %%xmm3, %%xmm5")) /* xmm5 = c_re = x_re * b_re - x_im * b_im */ \ 157 __ASM_EMIT(FMA_SEL("vaddps %%xmm4, %%xmm7, %%xmm4", "vfmadd231ps 0x00 + %[FFT_A], %%xmm7, %%xmm4")) /* xmm4 = c_im = x_re * b_im + x_im * b_re */ \ 158 __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm0") /* xmm0 = a_re - c_re */ \ 159 __ASM_EMIT("vsubps %%xmm4, %%xmm6, %%xmm1") /* xmm1 = a_im - c_im */ \ 160 __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = a_re + c_re */ \ 161 __ASM_EMIT("vaddps %%xmm4, %%xmm6, %%xmm3") /* xmm3 = a_im + c_im */ \ 162 /* Store */ \ 163 __ASM_EMIT("vmovups %%xmm2, 0x00(%[dst])") \ 164 __ASM_EMIT("vmovups %%xmm0, 0x10(%[dst])") \ 165 __ASM_EMIT("vmovups %%xmm3, 0x20(%[dst])") \ 166 __ASM_EMIT("vmovups %%xmm1, 0x30(%[dst])") \ 167 __ASM_EMIT("4:") \ 168 : [dst] "+r" (dst), [nb] "+r" (nb) \ 169 : [FFT_A] "o" (FFT_A) \ 170 : "cc", "memory", \ 171 "%xmm0", "%xmm1", "%xmm2", "%xmm3", \ 172 "%xmm4", "%xmm5", "%xmm6", "%xmm7" \ 173 ) 174 175 namespace avx 176 { 177 #define FMA_OFF(a, b) a 178 #define FMA_ON(a, b) b 179 fastconv_direct_prepare(float * dst,const float * src,const float * ak,const float * wk,size_t np)180 static inline void fastconv_direct_prepare(float *dst, const float *src, const float *ak, const float *wk, size_t np) 181 { 182 FASTCONV_DIRECT_PREPARE_BODY(FMA_OFF); 183 } 184 fastconv_reverse_prepare(float * dst,size_t nb)185 static inline void fastconv_reverse_prepare(float *dst, size_t nb) 186 { 187 FASTCONV_REVERSE_PREPARE_BODY(FMA_OFF); 188 } 189 fastconv_direct_prepare_fma3(float * dst,const float * src,const float * ak,const float * wk,size_t np)190 static inline void fastconv_direct_prepare_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np) 191 { 192 FASTCONV_DIRECT_PREPARE_BODY(FMA_OFF); 193 } 194 fastconv_reverse_prepare_fma3(float * dst,size_t nb)195 static inline void fastconv_reverse_prepare_fma3(float *dst, size_t nb) 196 { 197 FASTCONV_REVERSE_PREPARE_BODY(FMA_OFF); 198 } 199 fastconv_direct_unpack(float * dst,const float * src)200 static inline void fastconv_direct_unpack(float *dst, const float *src) 201 { 202 ARCH_X86_ASM( 203 __ASM_EMIT("vmovups (%[src]), %%xmm0") 204 __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") 205 __ASM_EMIT("vmovups %%ymm0, 0x00(%[dst])") 206 __ASM_EMIT("vmovups %%ymm1, 0x20(%[dst])") 207 : 208 : [dst] "r" (dst), [src] "r" (src) 209 : "%xmm0", "%xmm1" 210 ); 211 } 212 fastconv_reverse_unpack(float * dst,const float * src,size_t rank)213 static inline void fastconv_reverse_unpack(float *dst, const float *src, size_t rank) 214 { 215 size_t blocks = 1 << rank; 216 float norm = 1.0f / float(blocks); 217 218 // Perform 4-element butterflies 219 ARCH_X86_ASM 220 ( 221 __ASM_EMIT("vbroadcastss %[norm], %%ymm0") 222 // 16x blocks 223 __ASM_EMIT("sub $16, %[blocks]") 224 __ASM_EMIT("jb 2f") 225 __ASM_EMIT("vmovaps %%ymm0, %%ymm1") 226 __ASM_EMIT("1:") 227 __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm2") /* ymm2 = r0 r1 r2 r3 r4 r5 r6 r7 */ 228 __ASM_EMIT("vmulps 0x40(%[src]), %%ymm1, %%ymm3") /* ymm3 = r8 r9 r10 r11 r12 r13 r14 r15 */ 229 __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst])") 230 __ASM_EMIT("vmovups %%ymm3, 0x20(%[dst])") 231 __ASM_EMIT("add $0x80, %[dst]") 232 __ASM_EMIT("add $0x40, %[src]") 233 __ASM_EMIT("sub $16, %[blocks]") 234 __ASM_EMIT("jae 1b") 235 __ASM_EMIT("2:") 236 // 8x block 237 __ASM_EMIT("add $8, %[blocks]") 238 __ASM_EMIT("jl 4f") 239 __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm2") /* ymm2 = r0 r1 r2 r3 r4 r5 r6 r7 */ 240 __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst])") 241 __ASM_EMIT("4:") 242 243 : [dst] "+r"(dst), [src] "+r" (src), [blocks] "+r" (blocks) 244 : [norm] "o" (norm) 245 : "cc", "memory", 246 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 247 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 248 ); 249 } 250 fastconv_reverse_unpack_adding(float * dst,const float * src,size_t rank)251 static inline void fastconv_reverse_unpack_adding(float *dst, const float *src, size_t rank) 252 { 253 size_t blocks = 1 << rank; 254 float norm = 1.0f / float(blocks); 255 256 // Perform 4-element butterflies 257 ARCH_X86_ASM 258 ( 259 __ASM_EMIT("vbroadcastss %[norm], %%ymm0") 260 // 16x blocks 261 __ASM_EMIT("sub $16, %[blocks]") 262 __ASM_EMIT("jb 2f") 263 __ASM_EMIT("vmovaps %%ymm0, %%ymm1") 264 __ASM_EMIT("1:") 265 __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm2") /* ymm2 = r0 r1 r2 r3 r4 r5 r6 r7 */ 266 __ASM_EMIT("vmulps 0x40(%[src]), %%ymm1, %%ymm3") /* ymm3 = r8 r9 r10 r11 r12 r13 r14 r15 */ 267 __ASM_EMIT("vaddps 0x00(%[dst]), %%ymm2, %%ymm2") 268 __ASM_EMIT("vaddps 0x20(%[dst]), %%ymm3, %%ymm3") 269 __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst])") 270 __ASM_EMIT("vmovups %%ymm3, 0x20(%[dst])") 271 __ASM_EMIT("add $0x80, %[dst]") 272 __ASM_EMIT("add $0x40, %[src]") 273 __ASM_EMIT("sub $16, %[blocks]") 274 __ASM_EMIT("jae 1b") 275 __ASM_EMIT("2:") 276 // 8x block 277 __ASM_EMIT("add $8, %[blocks]") 278 __ASM_EMIT("jl 4f") 279 __ASM_EMIT("vmulps 0x00(%[src]), %%ymm0, %%ymm2") /* ymm2 = r0 r1 r2 r3 r4 r5 r6 r7 */ 280 __ASM_EMIT("vaddps 0x00(%[dst]), %%ymm2, %%ymm2") 281 __ASM_EMIT("vmovups %%ymm2, 0x00(%[dst])") 282 __ASM_EMIT("4:") 283 284 : [dst] "+r"(dst), [src] "+r" (src), [blocks] "+r" (blocks) 285 : [norm] "o" (norm) 286 : "cc", "memory", 287 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 288 "%xmm4", "%xmm5", "%xmm6", "%xmm7" 289 ); 290 } 291 292 #undef FASTCONV_DIRECT_PREPARE_BODY 293 #undef FASTCONV_REVERSE_PREPARE_BODY 294 #undef FMA_OFF 295 #undef FMA_ON 296 } 297 298 299