/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size.
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
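/*
 * Illustration of the block layout described above: after a non-interleaved
 * transform, eight complex values sit in memory as
 *   {r0,r1,r2,r3, i0,i1,i2,i3, r4,r5,r6,r7, i4,i5,i6,i7}
 * whereas the C code (and the _interleave variants below) store
 *   {r0,i0, r1,i1, r2,i2, r3,i3, r4,i4, ...}
 */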

#include "config.h"

#if HAVE_GNU_AS && HAVE_ALTIVEC

#include "asm.S"

.text

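// addi/addis take signed 16-bit immediates, so a full 32-bit constant is
// split into \imm@l plus \imm@ha (the high half, adjusted for the sign of
// the low half); either half is skipped when it is not needed.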
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

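// 4-point FFT of one block: a0/a1 hold the input in {re,im} interleaved
// order; the result comes back deinterleaved, reals in a2 and imaginaries
// in a3.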
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

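// Two independent FFT4s (the a* and b* register sets), with the
// instructions interleaved to help scheduling.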
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

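// 8-point FFT: an FFT4 on a0/a1 runs interleaved with the second-half
// butterflies on b0/b1; v17/v18 supply the +-1/sqrt(2) factors loaded from
// fft_data below.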
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20         // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm

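// Butterfly: d0 = s0 + s1, d1 = s0 - s1.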
.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

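// Interleave the words of s0 and s1; used to convert a {4x re, 4x im} block
// back to {re,im} pairs on output.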
.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm

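// Leaf transforms, r3 = z. The plain variants store the result in block
// order; the \interleave variants zip it back to {re,im} pairs first.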
.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip     v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx    v8, 0,r3
    stvx    v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
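// Split-radix combine pass: merges the half-size sub-transform at z with the
// two quarter-size sub-transforms at z+o2 and z+o3.  r3 = z, r4 = wre,
// r5 = n (the loop count); the mirrored wim values are read downwards
// starting at wre + 16*n bytes.  The interleaved variant also zips the
// result back to {re,im} order as it stores.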
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6   // o2
    slwi   r5,r5,5   // o1
    add   r10,r5,r7  // o3
    add    r0,r4,r0  // wim
    addi   r6,r5,16  // o1+16
    addi   r8,r7,16  // o2+16
    addi  r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4  // wre
    lvx   v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19   // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7        // r2 = z[o2]
    lvx    v5,r3,r8        // i2 = z[o2+16]
    lvx    v6,r3,r10       // r3 = z[o3]
    lvx    v7,r3,r11       // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx    v0, 0,r3        // r0 = z[0]
    lvx    v3,r3,r6        // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx    v1,r3,r9        // i0 = z[16]
    lvx    v2,r3,r5        // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz 1b
    sub    r3,r3,r5
    blr
.endm

#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
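// vcprm(a,b,c,d) emits a vperm control vector that selects 32-bit words:
// indices 0-3 take words from the first source operand, s0-s3 from the second.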

    .rodata
    .align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float   M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)

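// lvm/stvm: load/store a list of vector registers from/to consecutive
// 16-byte slots starting at \b, advancing \b past each one.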
.macro lvm  b, r, regs:vararg
    lvx     \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    lvm     \b, \regs
  .endif
.endm

.macro stvm b, r, regs:vararg
    stvx    \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    stvm    \b, \regs
  .endif
.endm

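// Public entry point: r3 = FFTContext (nbits is assumed to be its first
// word), r4 = z.  Saves v20-v29 and VRSAVE, loads the constants from
// fft_data into v14-v29, sets up r9/r12 as described in the header comment,
// then calls the size-specific routine through the dispatch table, indexed
// by nbits-2.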
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr    r0
    stp     r0, 2*PS(r1)
    stpu    r1, -(160+16*PS)(r1)
    get_got r11
    addi    r6, r1, 16*PS
    stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw     r0, 15*PS(r1)
    li      r6, 0xfffffffc
    mtvrsave r6

    movrel  r6, fft_data, r11
    lvm     r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm     r6, v22, v23, v24, v25, v26, v27, v28, v29

    li      r9, 16
    movrel  r12, X(ff_cos_tabs), r11

    movrel  r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz     r3, 0(r3)
    subi    r3, r3, 2
    slwi    r3, r3, 2+ARCH_PPC64
    lpx     r3, r3, r6
    mtctr   r3
    mr      r3, r4
    bctrl

    addi    r6, r1, 16*PS
    lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz     r6, 15*PS(r1)
    mtvrsave r6
    lp      r1, 0(r1)
    lp      r0, 2*PS(r1)
    mtlr    r0
    blr
.endm

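// Recursive split-radix step: fft\n runs fft\n2 on the first half and
// fft\n4 on each quarter of the second half, then tail-calls one combine
// pass with the twiddles from ff_cos_tabs[\bits].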
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm

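// Instantiate the whole family: the fft4/8/16 leaves, the pass, fft32
// through fft65536, the ff_fft_calc wrapper and its dispatch table; once
// for block-order output and once (with \interleave set) for
// {re,im}-interleaved output.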
.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave

#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
