/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 13 Dec 2019
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_AVX_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_AVX_IMPL */

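/*
 * FASTCONV_DIRECT_PREPARE_BODY: unpacks np real samples from src into the complex
 * working buffer at dst, apparently as the first butterfly rank of the forward FFT:
 * each group of 8 samples is stored as-is with zero imaginary part, and a copy
 * multiplied by the current angle (b_re = x_re * re, b_im = -x_im * re) is stored
 * np * 8 bytes further; the angle (ak) is rotated by the step (wk) after every group.
 */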
#define FASTCONV_DIRECT_PREPARE_BODY(FMA_SEL) \
        size_t off; \
        \
        ARCH_X86_ASM( \
            __ASM_EMIT("lea                 (,%[np], 8), %[off]") \
            __ASM_EMIT("vmovups             0x00(%[ak]), %%ymm6")               /* ymm6 = x_re */ \
            __ASM_EMIT("vmovups             0x20(%[ak]), %%ymm7")               /* ymm7 = x_im */ \
            __ASM_EMIT("vmovups             0x00(%[wk]), %%ymm4")               /* ymm4 = w_re */ \
            __ASM_EMIT("vmovups             0x20(%[wk]), %%ymm5")               /* ymm5 = w_im */ \
            __ASM_EMIT("vxorps              %%ymm1, %%ymm1, %%ymm1")            /* ymm1 = a_im = 0 */ \
            /* x8 blocks */ \
            __ASM_EMIT32("subl              $8, %[np]") \
            __ASM_EMIT64("sub               $8, %[np]") \
            __ASM_EMIT("jb                  2f") \
            __ASM_EMIT("1:") \
            __ASM_EMIT("vmovups             0x00(%[src]), %%ymm0")              /* ymm0 = a_re = re */ \
            __ASM_EMIT("vmulps              %%ymm0, %%ymm7, %%ymm3")            /* ymm3 = x_im * re */ \
            __ASM_EMIT("vmulps              %%ymm0, %%ymm6, %%ymm2")            /* ymm2 = b_re = x_re * re */ \
            __ASM_EMIT("vsubps              %%ymm3, %%ymm1, %%ymm3")            /* ymm3 = b_im = -x_im * re */ \
            __ASM_EMIT("vmovups             %%ymm0, 0x00(%[dst])") \
            __ASM_EMIT("vmovups             %%ymm1, 0x20(%[dst])") \
            __ASM_EMIT("vmovups             %%ymm2, 0x00(%[dst], %[off])") \
            __ASM_EMIT("vmovups             %%ymm3, 0x20(%[dst], %[off])") \
            __ASM_EMIT("add                 $0x20, %[src]") \
            __ASM_EMIT("add                 $0x40, %[dst]") \
            __ASM_EMIT32("subl              $8, %[np]") \
            __ASM_EMIT64("sub               $8, %[np]") \
            __ASM_EMIT("jb                  2f") \
            /* Rotate angle */ \
            __ASM_EMIT("vmulps              %%ymm5, %%ymm6, %%ymm2")            /* ymm2 = w_im * x_re */ \
            __ASM_EMIT("vmulps              %%ymm5, %%ymm7, %%ymm3")            /* ymm3 = w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vmulps      %%ymm4, %%ymm6, %%ymm6", ""))       /* ymm6 = w_re * x_re */ \
            __ASM_EMIT(FMA_SEL("vmulps      %%ymm4, %%ymm7, %%ymm7", ""))       /* ymm7 = w_re * x_im */ \
            __ASM_EMIT(FMA_SEL("vsubps      %%ymm3, %%ymm6, %%ymm6", "vfmsub132ps %%ymm4, %%ymm3, %%ymm6")) /* ymm6 = x_re' = w_re * x_re - w_im * x_im */ \
            __ASM_EMIT(FMA_SEL("vaddps      %%ymm2, %%ymm7, %%ymm7", "vfmadd132ps %%ymm4, %%ymm2, %%ymm7")) /* ymm7 = x_im' = w_re * x_im + w_im * x_re */ \
            __ASM_EMIT("jmp                 1b") \
            __ASM_EMIT("2:") \
            : [dst] "+r" (dst), [src] "+r" (src), \
              [off] "=&r" (off), [np] __ASM_ARG_RW(np) \
            : [ak] "r" (ak), [wk] "r" (wk) \
            : "cc", "memory", \
              "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
              "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
        )

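/*
 * FASTCONV_REVERSE_PREPARE_BODY: applies the three lowest-order butterfly passes of
 * the reverse FFT in place to nb blocks of 8 complex samples, each block stored as
 * 8 packed real values followed by 8 packed imaginary values; the main loop handles
 * two blocks per iteration, the tail handles a single remaining block.
 */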
#define FASTCONV_REVERSE_PREPARE_BODY(FMA_SEL) \
        ARCH_X86_ASM( \
            /* 2x blocks of 4x-butterfly loop */ \
            __ASM_EMIT("sub             $2, %[nb]") \
            __ASM_EMIT("jb              2f") \
            __ASM_EMIT("1:") \
                __ASM_EMIT("vmovups         0x00(%[dst]), %%xmm0")                  /* xmm0 = r0  r1  r2  r3 */ \
                __ASM_EMIT("vmovups         0x10(%[dst]), %%xmm4")                  /* xmm4 = r4  r5  r6  r7 */ \
                __ASM_EMIT("vmovups         0x20(%[dst]), %%xmm2")                  /* xmm2 = i0  i1  i2  i3 */ \
                __ASM_EMIT("vmovups         0x30(%[dst]), %%xmm6")                  /* xmm6 = i4  i5  i6  i7 */ \
                __ASM_EMIT("vinsertf128     $1, 0x40(%[dst]), %%ymm0, %%ymm0")      /* ymm0 = r0  r1  r2  r3  r8  r9  r10 r11   */ \
                __ASM_EMIT("vinsertf128     $1, 0x50(%[dst]), %%ymm4, %%ymm4")      /* ymm4 = r4  r5  r6  r7  r12 r13 r14 r15   */ \
                __ASM_EMIT("vinsertf128     $1, 0x60(%[dst]), %%ymm2, %%ymm2")      /* ymm2 = i0  i1  i2  i3  i8  i9  i10 i11   */ \
                __ASM_EMIT("vinsertf128     $1, 0x70(%[dst]), %%ymm6, %%ymm6")      /* ymm6 = i4  i5  i6  i7  i12 i13 i14 i15   */ \
                /* 1st-order 4x butterfly */ \
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ \
                __ASM_EMIT("vhsubps         %%ymm6, %%ymm2, %%ymm3")                /* ymm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ \
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ \
                __ASM_EMIT("vhaddps         %%ymm6, %%ymm2, %%ymm2")                /* ymm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ \
                /* 2nd-order 4x butterfly */ \
                __ASM_EMIT("vblendps        $0xaa, %%ymm3, %%ymm1, %%ymm4")         /* ymm4 = r1' i3' r5' i7' */ \
                __ASM_EMIT("vblendps        $0xaa, %%ymm1, %%ymm3, %%ymm5")         /* ymm5 = i1' r3' i5' r7' */ \
                __ASM_EMIT("vhsubps         %%ymm4, %%ymm0, %%ymm1")                /* ymm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ \
                __ASM_EMIT("vhsubps         %%ymm5, %%ymm2, %%ymm3")                /* ymm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ \
                __ASM_EMIT("vhaddps         %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ \
                __ASM_EMIT("vhaddps         %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ \
                __ASM_EMIT("vblendps        $0xcc, %%ymm1, %%ymm0, %%ymm4")         /* ymm4 = r0" r4" r1" r5" */ \
                __ASM_EMIT("vblendps        $0xcc, %%ymm0, %%ymm1, %%ymm5")         /* ymm5 = r2" r6" r3" r7" */ \
                __ASM_EMIT("vshufps         $0x88, %%ymm3, %%ymm2, %%ymm6")         /* ymm6 = i0" i1" i2" i3" */ \
                __ASM_EMIT("vshufps         $0xdd, %%ymm3, %%ymm2, %%ymm7")         /* ymm7 = i4" i5" i6" i7" */ \
                __ASM_EMIT("vshufps         $0x88, %%ymm5, %%ymm4, %%ymm2")         /* ymm2 = r0" r1" r2" r3" */ \
                __ASM_EMIT("vshufps         $0xdd, %%ymm5, %%ymm4, %%ymm3")         /* ymm3 = r4" r5" r6" r7" */ \
                /* 3rd-order 8x butterfly */ \
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm3, %%ymm4")       /* ymm4 = x_im * b_re */ \
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%ymm7, %%ymm5")       /* ymm5 = x_im * b_im */ \
                __ASM_EMIT(FMA_SEL("vmulps  0x00 + %[FFT_A], %%ymm3, %%ymm3", ""))  /* ymm3 = x_re * b_re */ \
                __ASM_EMIT(FMA_SEL("vmulps  0x00 + %[FFT_A], %%ymm7, %%ymm7", ""))  /* ymm7 = x_re * b_im */ \
                __ASM_EMIT(FMA_SEL("vsubps  %%ymm5, %%ymm3, %%ymm5", "vfmsub231ps  0x00 + %[FFT_A], %%ymm3, %%ymm5"))       /* ymm5 = c_re = x_re * b_re - x_im * b_im */ \
                __ASM_EMIT(FMA_SEL("vaddps  %%ymm4, %%ymm7, %%ymm4", "vfmadd231ps  0x00 + %[FFT_A], %%ymm7, %%ymm4"))       /* ymm4 = c_im = x_re * b_im + x_im * b_re */ \
                __ASM_EMIT("vsubps          %%ymm5, %%ymm2, %%ymm0")                /* ymm0 = a_re - c_re */ \
                __ASM_EMIT("vsubps          %%ymm4, %%ymm6, %%ymm1")                /* ymm1 = a_im - c_im */ \
                __ASM_EMIT("vaddps          %%ymm5, %%ymm2, %%ymm2")                /* ymm2 = a_re + c_re */ \
                __ASM_EMIT("vaddps          %%ymm4, %%ymm6, %%ymm3")                /* ymm3 = a_im + c_im */ \
                /* Store */ \
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm3, 0x20(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm1, 0x30(%[dst])") \
                __ASM_EMIT("vextractf128    $1, %%ymm2, 0x40(%[dst])") \
                __ASM_EMIT("vextractf128    $1, %%ymm0, 0x50(%[dst])") \
                __ASM_EMIT("vextractf128    $1, %%ymm3, 0x60(%[dst])") \
                __ASM_EMIT("vextractf128    $1, %%ymm1, 0x70(%[dst])") \
            __ASM_EMIT("add             $0x80, %[dst]") \
            __ASM_EMIT("sub             $2, %[nb]") \
            __ASM_EMIT("jae             1b") \
            /* 1x block of 4x-butterfly */ \
            __ASM_EMIT("2:") \
            __ASM_EMIT("add             $1, %[nb]") \
            __ASM_EMIT("jl              4f") \
                __ASM_EMIT("vmovups         0x00(%[dst]), %%xmm0")                  /* xmm0 = r0  r1  r2  r3 */ \
                __ASM_EMIT("vmovups         0x10(%[dst]), %%xmm4")                  /* xmm4 = r4  r5  r6  r7 */ \
                __ASM_EMIT("vmovups         0x20(%[dst]), %%xmm2")                  /* xmm2 = i0  i1  i2  i3 */ \
                __ASM_EMIT("vmovups         0x30(%[dst]), %%xmm6")                  /* xmm6 = i4  i5  i6  i7 */ \
                /* 1st-order 4x butterfly */ \
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0-r1 r2-r3 r4-r5 r6-r7 = r1' r3' r5' r7' */ \
                __ASM_EMIT("vhsubps         %%xmm6, %%xmm2, %%xmm3")                /* xmm3 = i0-i1 i2-i3 i4-i5 i6-i7 = i1' i3' i5' i7' */ \
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0+r1 r2+r3 r4+r5 r6+r7 = r0' r2' r4' r6' */ \
                __ASM_EMIT("vhaddps         %%xmm6, %%xmm2, %%xmm2")                /* xmm2 = i0+i1 i2+i3 i4+i5 i6+i7 = i0' i2' i4' i6' */ \
                /* 2nd-order 4x butterfly */ \
                __ASM_EMIT("vblendps        $0xaa, %%xmm3, %%xmm1, %%xmm4")         /* xmm4 = r1' i3' r5' i7' */ \
                __ASM_EMIT("vblendps        $0xaa, %%xmm1, %%xmm3, %%xmm5")         /* xmm5 = i1' r3' i5' r7' */ \
                __ASM_EMIT("vhsubps         %%xmm4, %%xmm0, %%xmm1")                /* xmm1 = r0'-r2' r4'-r6' r1'-i3' r5'-i7' = r2" r6" r1" r5" */ \
                __ASM_EMIT("vhsubps         %%xmm5, %%xmm2, %%xmm3")                /* xmm3 = i0'-i2' i4'-i6' i1'-r3' i5'-r7' = i2" i6" i3" i7" */ \
                __ASM_EMIT("vhaddps         %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = r0'+r2' r4'+r6' r1'+i3' r5'+i7' = r0" r4" r3" r7" */ \
                __ASM_EMIT("vhaddps         %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = i0'+i2' i4'+i6' i1'+r3' i5'+r7' = i0" i4" i1" i5" */ \
                __ASM_EMIT("vblendps        $0xcc, %%xmm1, %%xmm0, %%xmm4")         /* xmm4 = r0" r4" r1" r5" */ \
                __ASM_EMIT("vblendps        $0xcc, %%xmm0, %%xmm1, %%xmm5")         /* xmm5 = r2" r6" r3" r7" */ \
                __ASM_EMIT("vshufps         $0x88, %%xmm3, %%xmm2, %%xmm6")         /* xmm6 = i0" i1" i2" i3" */ \
                __ASM_EMIT("vshufps         $0xdd, %%xmm3, %%xmm2, %%xmm7")         /* xmm7 = i4" i5" i6" i7" */ \
                __ASM_EMIT("vshufps         $0x88, %%xmm5, %%xmm4, %%xmm2")         /* xmm2 = r0" r1" r2" r3" */ \
                __ASM_EMIT("vshufps         $0xdd, %%xmm5, %%xmm4, %%xmm3")         /* xmm3 = r4" r5" r6" r7" */ \
                /* 3rd-order 8x butterfly */ \
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm3, %%xmm4")       /* xmm4 = x_im * b_re */ \
                __ASM_EMIT("vmulps          0x20 + %[FFT_A], %%xmm7, %%xmm5")       /* xmm5 = x_im * b_im */ \
                __ASM_EMIT(FMA_SEL("vmulps  0x00 + %[FFT_A], %%xmm3, %%xmm3", ""))  /* xmm3 = x_re * b_re */ \
                __ASM_EMIT(FMA_SEL("vmulps  0x00 + %[FFT_A], %%xmm7, %%xmm7", ""))  /* xmm7 = x_re * b_im */ \
                __ASM_EMIT(FMA_SEL("vsubps  %%xmm5, %%xmm3, %%xmm5", "vfmsub231ps  0x00 + %[FFT_A], %%xmm3, %%xmm5"))       /* xmm5 = c_re = x_re * b_re - x_im * b_im */ \
                __ASM_EMIT(FMA_SEL("vaddps  %%xmm4, %%xmm7, %%xmm4", "vfmadd231ps  0x00 + %[FFT_A], %%xmm7, %%xmm4"))       /* xmm4 = c_im = x_re * b_im + x_im * b_re */ \
                __ASM_EMIT("vsubps          %%xmm5, %%xmm2, %%xmm0")                /* xmm0 = a_re - c_re */ \
                __ASM_EMIT("vsubps          %%xmm4, %%xmm6, %%xmm1")                /* xmm1 = a_im - c_im */ \
                __ASM_EMIT("vaddps          %%xmm5, %%xmm2, %%xmm2")                /* xmm2 = a_re + c_re */ \
                __ASM_EMIT("vaddps          %%xmm4, %%xmm6, %%xmm3")                /* xmm3 = a_im + c_im */ \
                /* Store */ \
                __ASM_EMIT("vmovups         %%xmm2, 0x00(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm0, 0x10(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm3, 0x20(%[dst])") \
                __ASM_EMIT("vmovups         %%xmm1, 0x30(%[dst])") \
            __ASM_EMIT("4:") \
            : [dst] "+r" (dst), [nb] "+r" (nb) \
            : [FFT_A] "o" (FFT_A) \
            : "cc", "memory", \
              "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
              "%xmm4", "%xmm5", "%xmm6", "%xmm7" \
        )

namespace avx
{
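/*
 * FMA_SEL(generic, fma) in the macro bodies above expands to its first argument when
 * a body is instantiated with FMA_OFF (plain AVX) and to its second argument when
 * instantiated with FMA_ON (AVX with FMA3 instructions).
 */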
#define FMA_OFF(a, b)       a
#define FMA_ON(a, b)        b

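    // Plain AVX version: prepare np real samples for the direct FFT using the angle (ak) and rotation (wk) coefficients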
    static inline void fastconv_direct_prepare(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_DIRECT_PREPARE_BODY(FMA_OFF);
    }

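    // Plain AVX version: perform the first three reverse-FFT butterfly passes over nb blocks of 8 complex samples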
    static inline void fastconv_reverse_prepare(float *dst, size_t nb)
    {
        FASTCONV_REVERSE_PREPARE_BODY(FMA_OFF);
    }

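    // FMA3 version of fastconv_direct_prepare: selects the fused multiply-add sequences via FMA_ON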
    static inline void fastconv_direct_prepare_fma3(float *dst, const float *src, const float *ak, const float *wk, size_t np)
    {
        FASTCONV_DIRECT_PREPARE_BODY(FMA_ON);
    }

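    // FMA3 version of fastconv_reverse_prepare: selects the fused multiply-add sequences via FMA_ON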
    static inline void fastconv_reverse_prepare_fma3(float *dst, size_t nb)
    {
        FASTCONV_REVERSE_PREPARE_BODY(FMA_ON);
    }

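    // Unpack 4 real samples into one 8-sample complex block: the upper 4 real values and all imaginary values are set to zero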
    static inline void fastconv_direct_unpack(float *dst, const float *src)
    {
        ARCH_X86_ASM(
            __ASM_EMIT("vmovups         (%[src]), %%xmm0")
            __ASM_EMIT("vxorps          %%ymm1, %%ymm1, %%ymm1")
            __ASM_EMIT("vmovups         %%ymm0, 0x00(%[dst])")
            __ASM_EMIT("vmovups         %%ymm1, 0x20(%[dst])")
            :
            : [dst] "r" (dst), [src] "r" (src)
            : "memory",
              "%xmm0", "%xmm1"
        );
    }

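    // Extract the real parts of (1 << rank) complex samples, scaled by 1 / (1 << rank), into a contiguous block at dst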
    static inline void fastconv_reverse_unpack(float *dst, const float *src, size_t rank)
    {
        size_t blocks = 1 << rank;
        float norm = 1.0f / float(blocks);

        // Normalize the result and unpack the real parts
        ARCH_X86_ASM
        (
            __ASM_EMIT("vbroadcastss    %[norm], %%ymm0")
            // 16x blocks
            __ASM_EMIT("sub             $16, %[blocks]")
            __ASM_EMIT("jb              2f")
            __ASM_EMIT("vmovaps         %%ymm0, %%ymm1")
            __ASM_EMIT("1:")
            __ASM_EMIT("vmulps          0x00(%[src]), %%ymm0, %%ymm2")  /* ymm2 = r0  r1  r2  r3  r4  r5  r6  r7  */
            __ASM_EMIT("vmulps          0x40(%[src]), %%ymm1, %%ymm3")  /* ymm3 = r8  r9  r10 r11 r12 r13 r14 r15 */
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst])")
            __ASM_EMIT("vmovups         %%ymm3, 0x20(%[dst])")
            __ASM_EMIT("add             $0x40, %[dst]")
            __ASM_EMIT("add             $0x80, %[src]")
            __ASM_EMIT("sub             $16, %[blocks]")
            __ASM_EMIT("jae             1b")
            __ASM_EMIT("2:")
            // 8x block
            __ASM_EMIT("add             $8, %[blocks]")
            __ASM_EMIT("jl              4f")
            __ASM_EMIT("vmulps          0x00(%[src]), %%ymm0, %%ymm2")  /* ymm2 = r0  r1  r2  r3  r4  r5  r6  r7  */
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst])")
            __ASM_EMIT("4:")
            : [dst] "+r" (dst), [src] "+r" (src), [blocks] "+r" (blocks)
            : [norm] "o" (norm)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

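    // Same as fastconv_reverse_unpack, but the scaled real parts are added to the existing contents of dst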
    static inline void fastconv_reverse_unpack_adding(float *dst, const float *src, size_t rank)
    {
        size_t blocks = 1 << rank;
        float norm = 1.0f / float(blocks);

        // Normalize the result and add the unpacked real parts to dst
        ARCH_X86_ASM
        (
            __ASM_EMIT("vbroadcastss    %[norm], %%ymm0")
            // 16x blocks
            __ASM_EMIT("sub             $16, %[blocks]")
            __ASM_EMIT("jb              2f")
            __ASM_EMIT("vmovaps         %%ymm0, %%ymm1")
            __ASM_EMIT("1:")
            __ASM_EMIT("vmulps          0x00(%[src]), %%ymm0, %%ymm2")  /* ymm2 = r0  r1  r2  r3  r4  r5  r6  r7  */
            __ASM_EMIT("vmulps          0x40(%[src]), %%ymm1, %%ymm3")  /* ymm3 = r8  r9  r10 r11 r12 r13 r14 r15 */
            __ASM_EMIT("vaddps          0x00(%[dst]), %%ymm2, %%ymm2")
            __ASM_EMIT("vaddps          0x20(%[dst]), %%ymm3, %%ymm3")
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst])")
            __ASM_EMIT("vmovups         %%ymm3, 0x20(%[dst])")
            __ASM_EMIT("add             $0x40, %[dst]")
            __ASM_EMIT("add             $0x80, %[src]")
            __ASM_EMIT("sub             $16, %[blocks]")
            __ASM_EMIT("jae             1b")
            __ASM_EMIT("2:")
            // 8x block
            __ASM_EMIT("add             $8, %[blocks]")
            __ASM_EMIT("jl              4f")
            __ASM_EMIT("vmulps          0x00(%[src]), %%ymm0, %%ymm2")  /* ymm2 = r0  r1  r2  r3  r4  r5  r6  r7  */
            __ASM_EMIT("vaddps          0x00(%[dst]), %%ymm2, %%ymm2")
            __ASM_EMIT("vmovups         %%ymm2, 0x00(%[dst])")
            __ASM_EMIT("4:")
            : [dst] "+r" (dst), [src] "+r" (src), [blocks] "+r" (blocks)
            : [norm] "o" (norm)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

#undef FASTCONV_DIRECT_PREPARE_BODY
#undef FASTCONV_REVERSE_PREPARE_BODY
#undef FMA_OFF
#undef FMA_ON
}