1/*
2 * Copyright (c) 2014 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
// Layout constants. As used in this file:
//  - MAX_CHANNELS is the stride, in elements, between consecutive entries of
//    one channel in the sample and bypassed_lsbs buffers.
//  - 4 * MAX_FIR_ORDER is the byte offset of the IIR coefficients in coeff.
//  - 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) is the byte offset from the FIR
//    state to the IIR state.
// NOTE(review): presumably these must match the definitions used by the C
// decoder - confirm.
24#define MAX_CHANNELS        8
25#define MAX_FIR_ORDER       8
26#define MAX_IIR_ORDER       4
27#define MAX_RATEFACTOR      4
28#define MAX_BLOCKSIZE       (40 * MAX_RATEFACTOR)
29
// Register roles for ff_mlp_filter_channel_arm and its macros.
//   PST    state pointer (first argument)
//   PCO    coefficient pointer (second argument)
//   AC0    accumulator, low word; its register holds firorder on entry
//   AC1    accumulator, high word; its register holds iirorder on entry
//   CO0-3  coefficients
//   ST0-3  filter state values; ST0 and ST1 first carry filter_shift and
//          mask, ST2 and ST3 double as scratch for the sample and the result
//   I      remaining sample count
//   PSAMP  sample buffer pointer
30PST     .req    a1
31PCO     .req    a2
32AC0     .req    a3
33AC1     .req    a4
34CO0     .req    v1
35CO1     .req    v2
36CO2     .req    v3
37CO3     .req    v4
38ST0     .req    v5
39ST1     .req    v6
40ST2     .req    sl
41ST3     .req    fp
42I       .req    ip
43PSAMP   .req    lr
44
45
// Emit one position-independent jump-table entry per argument. Each argument
// is the offset of a target label from the table base (label 0 at the call
// sites in this file).
//  - A line: 32-bit entry, biased by -4. The dispatch sequence used with it
//    (ldr from [pc, ...] followed by add pc, pc, entry) sits directly before
//    the table, so pc reads as table base + 4 at the add.
//  - T line: 16-bit entry holding offset / 2, the form consumed by tbh.
// NOTE(review): A and T are presumably the ARM-only / Thumb-only line
// prefixes provided by the included asm.S - confirm there.
// Any further arguments are handled by recursing on the remainder.
46.macro branch_pic_label first, remainder:vararg
47A       .word           \first   - 4
48T       .hword          (\first) / 2
49.ifnb   \remainder
50        branch_pic_label \remainder
51.endif
52.endm
53
54// Some macros that do loads/multiplies where the register number is determined
55// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
56
// Load one word into register alias <group><index>, where index is an
// assembly-time expression. Alternate macro mode is enabled only around the
// inner call so that %(expr) is replaced by the numeric value of the
// expression before load_ pastes it onto the group name.
//   group   alias prefix (CO and ST are used in this file)
//   index   expression giving the register number within the group
//   base    base address register
//   offset  byte offset from base
57.macro load  group, index, base, offset
58       .altmacro
59       load_ \group, %(\index), \base, \offset
60       .noaltmacro
61.endm
62
// Helper for load: index is already a plain number here, so the group name
// and index are concatenated into a register alias and a single ldr is emitted.
63.macro load_ group, index, base, offset
64        ldr     \group\index, [\base, #\offset]
65.endm
66
// Load two consecutive words into register aliases <group><index> and
// <group><index + 1>. Both register numbers are evaluated in alternate macro
// mode and handed to loadd_ as plain numbers.
// NOTE(review): index + 1 is not wrapped, so index 3 would name a register
// alias that does not exist; the call sites in this file appear to pass only
// even indices - confirm before adding new uses.
67.macro loadd  group, index, base, offset
68       .altmacro
69       loadd_ \group, %(\index), %(\index+1), \base, \offset
70       .noaltmacro
71.endm
72
// Helper for loadd: emit the actual double-word load.
// On the A-prefixed (ARM mode) path an offset of 256 or more is split into
// two single-word loads; otherwise a single ldrd is used.
// NOTE(review): the 256 limit presumably reflects the 8-bit immediate offset
// of the ARM-mode ldrd encoding, which the Thumb-2 encoding does not share -
// confirm against the architecture manual.
73.macro loadd_ group, index0, index1, base, offset
74A .if \offset >= 256
75A       ldr     \group\index0, [\base, #\offset]
76A       ldr     \group\index1, [\base, #(\offset) + 4]
77A .else
78        ldrd    \group\index0, \group\index1, [\base, #\offset]
79A .endif
80.endm
81
// Multiply coefficient and state registers number <index>, where index is an
// assembly-time expression evaluated in alternate macro mode.
//   index       expression giving the CO/ST register number
//   accumulate  nonzero to add the product to the accumulator
//   long        nonzero for a 64-bit result, zero for a 32-bit result
82.macro multiply  index, accumulate, long
83        .altmacro
84        multiply_ %(\index), \accumulate, \long
85        .noaltmacro
86.endm
87
// Helper for multiply: index is a plain number here.
//   long, accumulate      -> smlal  (AC1:AC0 += CO * ST, signed 64-bit)
//   long, no accumulate   -> smull  (AC1:AC0  = CO * ST, signed 64-bit)
//   short, accumulate     -> mla    (AC0 += CO * ST, low 32 bits only)
//   short, no accumulate  -> mul    (AC0  = CO * ST, low 32 bits only)
// AC1 is left untouched by the two short forms.
88.macro multiply_  index, accumulate, long
89 .if \long
90  .if \accumulate
91        smlal   AC0, AC1, CO\index, ST\index
92  .else
93        smull   AC0, AC1, CO\index, ST\index
94  .endif
95 .else
96  .if \accumulate
97        mla     AC0, CO\index, ST\index, AC0
98  .else
99        mul     AC0, CO\index, ST\index
100  .endif
101 .endif
102.endm
103
104// A macro to update the load register number and load offsets
105
// Assembly-time bookkeeping only; no instructions are emitted.
// Advances LOAD_REG (modulo 4) and both byte offsets by howmany words and
// counts the loaded taps off FIR_REMAIN first, then IIR_REMAIN. At the moment
// the last FIR tap has been accounted for, both offsets are switched to the
// start of the IIR coefficients and IIR state so the next load reads from
// there.
106.macro inc  howmany
107  .set LOAD_REG, (LOAD_REG + \howmany) & 3
108  .set OFFSET_CO, OFFSET_CO + 4 * \howmany
109  .set OFFSET_ST, OFFSET_ST + 4 * \howmany
110  .if FIR_REMAIN > 0
111    .set FIR_REMAIN, FIR_REMAIN - \howmany
112    .if FIR_REMAIN == 0
113      .set OFFSET_CO, 4 * MAX_FIR_ORDER
114      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
115    .endif
116  .elseif IIR_REMAIN > 0
117    .set IIR_REMAIN, IIR_REMAIN - \howmany
118  .endif
119.endm
120
121// Macro to implement the inner loop for one specific combination of parameters
122
// Emit the complete per-sample loop for one fixed parameter combination.
//   mask_minus1  nonzero when mask is -1, so the AND with MASK is omitted
//   shift_0      nonzero when filter_shift is 0 (32-bit multiplies, no shift)
//   shift_8      nonzero when filter_shift is 8 (shift by a constant)
//   iir_taps     number of IIR coefficients
//   fir_taps     number of FIR coefficients
// Register state expected on entry (as set up by ff_mlp_filter_channel_arm):
//   PST = state, PCO = coeff, ST0 = filter_shift, ST1 = mask,
//   I = blocksize, PSAMP = sample_buffer.
// The emitted code ends with a branch to label 99. SHIFT and MASK are
// assembly-time aliases chosen per combination; they are released again at
// the end so that the next expansion can define them afresh.
// NOTE(review): the loop tests its counter at the bottom, so blocksize is
// assumed to be at least 1 - confirm that callers guarantee this.
123.macro implement_filter  mask_minus1, shift_0, shift_8, iir_taps, fir_taps
124  .set TOTAL_TAPS, \iir_taps + \fir_taps
125
126  // Deal with register allocation...
  // DEFINED_x  : alias x exists and must be released at the end
  // SHUFFLE_x  : x is copied once, before the loop, into its alias register
  // SPILL_x    : no register is free, x is reloaded from the stack per sample
127  .set DEFINED_SHIFT, 0
128  .set DEFINED_MASK, 0
129  .set SHUFFLE_SHIFT, 0
130  .set SHUFFLE_MASK, 0
131  .set SPILL_SHIFT, 0
132  .set SPILL_MASK, 0
133  .if TOTAL_TAPS == 0
134    // Little register pressure in this case - just keep MASK where it was
135    .if !\mask_minus1
136      MASK .req ST1
137      .set DEFINED_MASK, 1
138    .endif
139  .else
140    .if \shift_0
141      .if !\mask_minus1
142        // AC1 is unused with shift 0
143        MASK .req AC1
144        .set DEFINED_MASK, 1
145        .set SHUFFLE_MASK, 1
146      .endif
147    .elseif \shift_8
148      .if !\mask_minus1
149        .if TOTAL_TAPS <= 4
150        // All coefficients are preloaded (so pointer not needed)
151          MASK .req PCO
152          .set DEFINED_MASK, 1
153          .set SHUFFLE_MASK, 1
154        .else
155          .set SPILL_MASK, 1
156        .endif
157      .endif
158    .else // shift not 0 or 8
159      .if TOTAL_TAPS <= 3
160        // All coefficients are preloaded, and at least one CO register is unused
        // (preloading starts at CO1 for an odd FIR tap count and at CO0 for
        // an even one, which leaves CO0 or CO3 free respectively)
161        .if \fir_taps & 1
162          SHIFT .req CO0
163          .set DEFINED_SHIFT, 1
164          .set SHUFFLE_SHIFT, 1
165        .else
166          SHIFT .req CO3
167          .set DEFINED_SHIFT, 1
168          .set SHUFFLE_SHIFT, 1
169        .endif
170        .if !\mask_minus1
171          MASK .req PCO
172          .set DEFINED_MASK, 1
173          .set SHUFFLE_MASK, 1
174        .endif
175      .elseif TOTAL_TAPS == 4
176        // All coefficients are preloaded
177        SHIFT .req PCO
178        .set DEFINED_SHIFT, 1
179        .set SHUFFLE_SHIFT, 1
180        .if !\mask_minus1
181          .set SPILL_MASK, 1
182        .endif
183      .else
184        .set SPILL_SHIFT, 1
185        .if !\mask_minus1
186          .set SPILL_MASK, 1
187        .endif
188      .endif
189    .endif
190  .endif
  // Spilled values reuse ST0/ST1; they are reloaded from the stack after the
  // multiplies of each iteration, once the state values there are dead.
191  .if SPILL_SHIFT
192    SHIFT .req ST0
193    .set DEFINED_SHIFT, 1
194  .endif
195  .if SPILL_MASK
196    MASK .req ST1
197    .set DEFINED_MASK, 1
198  .endif
199
200        // Preload coefficients if possible
        // FIR coefficients are read from offset 0 and IIR coefficients from
        // offset 4 * MAX_FIR_ORDER; the starting register is chosen by the
        // parity of the FIR tap count, matching the loop body below.
201  .if TOTAL_TAPS <= 4
202    .set OFFSET_CO, 0
203    .if \fir_taps & 1
204      .set LOAD_REG, 1
205    .else
206      .set LOAD_REG, 0
207    .endif
208    .rept \fir_taps
209        load    CO, LOAD_REG, PCO, OFFSET_CO
210      .set LOAD_REG, (LOAD_REG + 1) & 3
211      .set OFFSET_CO, OFFSET_CO + 4
212    .endr
213    .set OFFSET_CO, 4 * MAX_FIR_ORDER
214    .rept \iir_taps
215        load    CO, LOAD_REG, PCO, OFFSET_CO
216      .set LOAD_REG, (LOAD_REG + 1) & 3
217      .set OFFSET_CO, OFFSET_CO + 4
218    .endr
219  .endif
220
221        // Move mask/shift to final positions if necessary
222        // Need to do this after preloading, because in some cases we
223        // reuse the coefficient pointer register
224  .if SHUFFLE_SHIFT
225        mov     SHIFT, ST0
226  .endif
227  .if SHUFFLE_MASK
228        mov     MASK, ST1
229  .endif
230
231        // Begin loop
23201:
233  .if TOTAL_TAPS == 0
234        // Things simplify a lot in this case
235        // In fact this could be pipelined further if it's worth it...
        // No taps means a zero accumulator: the masked input sample is
        // written to the FIR state, the IIR state and back to the buffer.
236        ldr     ST0, [PSAMP]
237        subs    I, I, #1
238    .if !\mask_minus1
239        and     ST0, ST0, MASK
240    .endif
241        str     ST0, [PST, #-4]!
242        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
243        str     ST0, [PSAMP], #4 * MAX_CHANNELS
244        bne     01b
245  .else
    // Reset the assembly-time load/multiply bookkeeping for the loop body.
246    .if \fir_taps & 1
247      .set LOAD_REG, 1
248    .else
249      .set LOAD_REG, 0
250    .endif
251    .set LOAD_BANK, 0
252    .set FIR_REMAIN, \fir_taps
253    .set IIR_REMAIN, \iir_taps
254    .if FIR_REMAIN == 0 // only IIR terms
255      .set OFFSET_CO, 4 * MAX_FIR_ORDER
256      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
257    .else
258      .set OFFSET_CO, 0
259      .set OFFSET_ST, 0
260    .endif
261    .set MUL_REG, LOAD_REG
262    .set COUNTER, 0
    // Unrolled body of TOTAL_TAPS + 2 slots: loads are emitted from slot 0
    // onwards while the multiplies are emitted from slot 2 onwards.
    // Coefficient loads appear only when there are more than 4 taps, since
    // otherwise the coefficients were preloaded above. After the first one
    // or two single loads, slots alternate (via LOAD_BANK) between a
    // coefficient pair load and a state pair load.
263    .rept TOTAL_TAPS + 2
264        // Do load(s)
265     .if FIR_REMAIN != 0 || IIR_REMAIN != 0
266      .if COUNTER == 0
267       .if TOTAL_TAPS > 4
268        load    CO, LOAD_REG, PCO, OFFSET_CO
269       .endif
270        load    ST, LOAD_REG, PST, OFFSET_ST
271        inc     1
272      .elseif COUNTER == 1 && (\fir_taps & 1) == 0
273       .if TOTAL_TAPS > 4
274        load    CO, LOAD_REG, PCO, OFFSET_CO
275       .endif
276        load    ST, LOAD_REG, PST, OFFSET_ST
277        inc     1
278      .elseif LOAD_BANK == 0
279       .if TOTAL_TAPS > 4
280        .if FIR_REMAIN == 0 && IIR_REMAIN == 1
281        load    CO, LOAD_REG, PCO, OFFSET_CO
282        .else
283        loadd   CO, LOAD_REG, PCO, OFFSET_CO
284        .endif
285       .endif
286       .set LOAD_BANK, 1
287      .else
288       .if FIR_REMAIN == 0 && IIR_REMAIN == 1
289        load    ST, LOAD_REG, PST, OFFSET_ST
290        inc     1
291       .else
292        loadd   ST, LOAD_REG, PST, OFFSET_ST
293        inc     2
294       .endif
295       .set LOAD_BANK, 0
296      .endif
297     .endif
298
299        // Do interleaved multiplies, slightly delayed
        // The first product (slot 2) initialises the accumulator, all later
        // ones accumulate; 64-bit products are used unless the shift is 0.
300     .if COUNTER >= 2
301        multiply MUL_REG, COUNTER > 2, !\shift_0
302      .set MUL_REG, (MUL_REG + 1) & 3
303     .endif
304     .set COUNTER, COUNTER + 1
305    .endr
306
307        // Post-process the result of the multiplies
        // Nine words were pushed on function entry, so sp + 9*4 addresses the
        // first stack argument (filter_shift) and the following word is mask.
308    .if SPILL_SHIFT
309        ldr     SHIFT, [sp, #9*4 + 0*4]
310    .endif
311    .if SPILL_MASK
312        ldr     MASK, [sp, #9*4 + 1*4]
313    .endif
314        ldr     ST2, [PSAMP]
315        subs    I, I, #1
        // AC0 = low 32 bits of (AC1:AC0 >> filter_shift)
316    .if \shift_8
317        mov     AC0, AC0, lsr #8
318        orr     AC0, AC0, AC1, lsl #24
319    .elseif !\shift_0
320        rsb     ST3, SHIFT, #32
321        mov     AC0, AC0, lsr SHIFT
        // NOTE(review): the T path presumably needs two instructions because
        // Thumb-2 lacks a register-shifted operand form for orr - confirm.
322A       orr     AC0, AC0, AC1, lsl ST3
323T       mov     AC1, AC1, lsl ST3
324T       orr     AC0, AC0, AC1
325    .endif
        // ST3 = (sample + accumulator) & mask, ST2 = ST3 - accumulator.
        // With mask -1 the subtraction is unnecessary: ST2 still holds the
        // unmodified input sample, which is the same value.
326    .if \mask_minus1
327        add     ST3, ST2, AC0
328    .else
329        add     ST2, ST2, AC0
330        and     ST3, ST2, MASK
331        sub     ST2, ST3, AC0
332    .endif
        // The state pointer steps down one word per sample: ST3 becomes the
        // newest FIR state entry, ST2 the newest IIR state entry, and ST3
        // replaces the sample before the buffer pointer advances.
333        str     ST3, [PST, #-4]!
334        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
335        str     ST3, [PSAMP], #4 * MAX_CHANNELS
336        bne     01b
337  .endif
338        b       99f
339
340  .if DEFINED_SHIFT
341    .unreq SHIFT
342  .endif
343  .if DEFINED_MASK
344    .unreq MASK
345  .endif
346.endm
347
// Dispatch on the FIR order held in a3 through a position-independent jump
// table, then emit one implement_filter expansion per reachable FIR order.
// The table and the expansions stop at 8 - iir_taps, so only combinations
// with at most 8 taps in total are generated; the value in a3 is not range
// checked here. On the A path CO0 is overwritten as scratch for the table
// entry.
//   mask_minus1, shift_0, shift_8  passed through to implement_filter
//   iir_taps                       IIR order fixed for this expansion
348.macro switch_on_fir_taps  mask_minus1, shift_0, shift_8, iir_taps
349A       ldr     CO0, [pc, a3, lsl #2]   // firorder is in range 0-(8-iir_taps)
350A       add     pc,  pc,  CO0
351T       tbh     [pc, a3, lsl #1]
3520:
353        branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
354        branch_pic_label (74f - 0b)
355 .if \iir_taps <= 3
356        branch_pic_label (75f - 0b)
357  .if \iir_taps <= 2
358        branch_pic_label (76f - 0b)
359   .if \iir_taps <= 1
360        branch_pic_label (77f - 0b)
361    .if \iir_taps == 0
362        branch_pic_label (78f - 0b)
363    .endif
364   .endif
365  .endif
366 .endif
36770:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
36871:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
36972:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
37073:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
37174:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
372 .if \iir_taps <= 3
37375:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
374  .if \iir_taps <= 2
37576:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
376   .if \iir_taps <= 1
37777:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
378    .if \iir_taps == 0
37978:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
380    .endif
381   .endif
382  .endif
383 .endif
384.endm
385
// Dispatch on the IIR order held in a4 through a position-independent jump
// table with five entries (orders 0 to 4), each leading to a
// switch_on_fir_taps expansion for that IIR order. The value in a4 is not
// range checked here. On the A path CO0 is overwritten as scratch for the
// table entry.
//   mask_minus1, shift_0, shift_8  passed through unchanged
386.macro switch_on_iir_taps  mask_minus1, shift_0, shift_8
387A       ldr     CO0, [pc, a4, lsl #2]   // iirorder is in range 0-4
388A       add     pc,  pc,  CO0
389T       tbh     [pc, a4, lsl #1]
3900:
391        branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
392        branch_pic_label (64f - 0b)
39360:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 0
39461:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 1
39562:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 2
39663:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 3
39764:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 4
398.endm
399
400/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
401 *                                int firorder, int iirorder,
402 *                                unsigned int filter_shift, int32_t mask,
403 *                                int blocksize, int32_t *sample_buffer);
404 */
// Entry point. state, coeff, firorder and iirorder arrive in a1-a4; the
// remaining four arguments are loaded from the stack. The code then selects
// one specialised loop by (mask == -1), (filter_shift is 0, 8 or other),
// iirorder and firorder. Every loop leaves through label 99, which restores
// the saved registers and returns.
405function ff_mlp_filter_channel_arm, export=1
        // Nine registers are saved, so the stack arguments start at sp + 9*4.
406        push    {v1-fp,lr}
407        add     v1, sp, #9*4 // point at arguments on stack
        // ST0 = filter_shift, ST1 = mask, I = blocksize, PSAMP = sample_buffer
408        ldm     v1, {ST0,ST1,I,PSAMP}
409        cmp     ST1, #-1
410        bne     30f
        // mask == -1 variants. After the shift, Z is set when the low three
        // bits of filter_shift are zero (shift is 0 or 8) and C holds bit 3,
        // which tells 8 apart from 0.
411        movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
412        bne     20f
413        bcs     10f
414        switch_on_iir_taps 1, 1, 0
41510:     switch_on_iir_taps 1, 0, 1
41620:     switch_on_iir_taps 1, 0, 0
        // General mask variants, same three-way split on filter_shift.
41730:     movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
418        bne     50f
419        bcs     40f
420        switch_on_iir_taps 0, 1, 0
42140:     switch_on_iir_taps 0, 0, 1
42250:     switch_on_iir_taps 0, 0, 0
42399:     pop     {v1-fp,pc}
424endfunc
425
        // Release the register aliases of the filter routine so that the
        // same names can be given new meanings for the rematrix routine
        // that follows.
426        .unreq  PST
427        .unreq  PCO
428        .unreq  AC0
429        .unreq  AC1
430        .unreq  CO0
431        .unreq  CO1
432        .unreq  CO2
433        .unreq  CO3
434        .unreq  ST0
435        .unreq  ST1
436        .unreq  ST2
437        .unreq  ST3
438        .unreq  I
439        .unreq  PSAMP
440
441/********************************************************************/
442
// Register roles for ff_mlp_rematrix_channel_arm and its macros.
// CO0-3 hold coefficients, SA0-3 hold samples and AC1:AC0 is the 64-bit
// accumulator. NOISE, LSB, DCH and MASK are second names for SA0-SA3: they
// are only live once the sample values in those registers are no longer
// needed within an iteration.
443PSA     .req    a1 // samples
444PCO     .req    a2 // coeffs
445PBL     .req    a3 // bypassed_lsbs
446INDEX   .req    a4
447CO0     .req    v1
448CO1     .req    v2
449CO2     .req    v3
450CO3     .req    v4
451SA0     .req    v5
452SA1     .req    v6
453SA2     .req    sl
454SA3     .req    fp
455AC0     .req    ip
456AC1     .req    lr
457NOISE   .req    SA0
458LSB     .req    SA1
459DCH     .req    SA2 // dest_ch
460MASK    .req    SA3
461
462    // INDEX is used as follows:
463    // bits 0..6   index2 (values up to 17, but wider so that we can
464    //               add to index field without needing to mask)
465    // bits 7..14  i (values up to 160)
466    // bit 15      underflow detect for i
467    // bits 25..31 (if access_unit_size_pow2 == 128)  \ index
468    // bits 26..31 (if access_unit_size_pow2 == 64)   /
469
// Emit the per-sample rematrix loop for one fixed parameter combination.
//   shift        matrix_noise_shift; 0 means that no noise term is added
//   index_mask   63 or 127, selecting a 6-bit or 7-bit noise index kept in
//                the top bits of INDEX (not evaluated when shift is 0)
//   mask_minus1  nonzero when mask is -1, so the AND with MASK is omitted
//   maxchan      1, 5 or 7; maxchan + 1 coefficient/sample products are summed
// Each iteration forms the 64-bit sum in AC1:AC0, optionally adds the noise
// byte shifted left by shift + 7, keeps the low 32 bits of the sum shifted
// right by 14, applies the mask, adds the bypassed LSB byte and stores the
// result to channel dest_ch of the current sample. The emitted code ends
// with a branch to label 98.
470.macro implement_rematrix  shift, index_mask, mask_minus1, maxchan
471    .if \maxchan == 1
472        // We can just leave the coefficients in registers in this case
473        ldrd    CO0, CO1, [PCO]
474    .endif
4751:
        // Loads are interleaved with the multiplies; in every variant the
        // last product (CO1 * SA1) is issued after the conditional block.
476    .if \maxchan == 1
477        ldrd    SA0, SA1, [PSA]
478        smull   AC0, AC1, CO0, SA0
479    .elseif \maxchan == 5
480        ldr     CO0, [PCO, #0]
481        ldr     SA0, [PSA, #0]
482        ldr     CO1, [PCO, #4]
483        ldr     SA1, [PSA, #4]
484        ldrd    CO2, CO3, [PCO, #8]
485        smull   AC0, AC1, CO0, SA0
486        ldrd    SA2, SA3, [PSA, #8]
487        smlal   AC0, AC1, CO1, SA1
488        ldrd    CO0, CO1, [PCO, #16]
489        smlal   AC0, AC1, CO2, SA2
490        ldrd    SA0, SA1, [PSA, #16]
491        smlal   AC0, AC1, CO3, SA3
492        smlal   AC0, AC1, CO0, SA0
493    .else // \maxchan == 7
494        ldr     CO2, [PCO, #0]
495        ldr     SA2, [PSA, #0]
496        ldr     CO3, [PCO, #4]
497        ldr     SA3, [PSA, #4]
498        ldrd    CO0, CO1, [PCO, #8]
499        smull   AC0, AC1, CO2, SA2
500        ldrd    SA0, SA1, [PSA, #8]
501        smlal   AC0, AC1, CO3, SA3
502        ldrd    CO2, CO3, [PCO, #16]
503        smlal   AC0, AC1, CO0, SA0
504        ldrd    SA2, SA3, [PSA, #16]
505        smlal   AC0, AC1, CO1, SA1
506        ldrd    CO0, CO1, [PCO, #24]
507        smlal   AC0, AC1, CO2, SA2
508        ldrd    SA0, SA1, [PSA, #24]
509        smlal   AC0, AC1, CO3, SA3
510        smlal   AC0, AC1, CO0, SA0
511    .endif
        // Reload the three words kept at the top of the stack. They replace
        // SA0, SA2 and SA3, whose samples have been consumed; SA1 is still
        // needed for the final product below.
512        ldm     sp, {NOISE, DCH, MASK}
513        smlal   AC0, AC1, CO1, SA1
514    .if \shift != 0
        // The noise index lives in the top bits of INDEX: use it as a byte
        // offset into the noise buffer, then advance it by index2 (the low
        // bits of INDEX); carries out of bit 31 are discarded, which wraps
        // the index.
515      .if \index_mask == 63
516        add     NOISE, NOISE, INDEX, lsr #32-6
517        ldrb    LSB, [PBL], #MAX_CHANNELS
518        ldrsb   NOISE, [NOISE]
519        add     INDEX, INDEX, INDEX, lsl #32-6
520      .else // \index_mask == 127
521        add     NOISE, NOISE, INDEX, lsr #32-7
522        ldrb    LSB, [PBL], #MAX_CHANNELS
523        ldrsb   NOISE, [NOISE]
524        add     INDEX, INDEX, INDEX, lsl #32-7
525      .endif
        // Count the sample off field i, then add the sign-extended noise
        // term to the 64-bit accumulator.
526        sub     INDEX, INDEX, #1<<7
527        adds    AC0, AC0, NOISE, lsl #\shift + 7
528        adc     AC1, AC1, NOISE, asr #31
529    .else
530        ldrb    LSB, [PBL], #MAX_CHANNELS
531        sub     INDEX, INDEX, #1<<7
532    .endif
533        add     PSA, PSA, #MAX_CHANNELS*4
        // AC0 = low 32 bits of (AC1:AC0 >> 14)
534        mov     AC0, AC0, lsr #14
535        orr     AC0, AC0, AC1, lsl #18
536    .if !\mask_minus1
537        and     AC0, AC0, MASK
538    .endif
539        add     AC0, AC0, LSB
        // Bit 15 of INDEX becomes set once field i has borrowed, which ends
        // the loop.
540        tst     INDEX, #1<<15
541        str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
542        beq     1b
543        b       98f
544.endm
545
// Three-way branch on the value in v4 (maxchan as loaded by
// ff_mlp_rematrix_channel_arm): below 5 selects the maxchan 1 loop, equal to
// 5 the maxchan 5 loop, anything higher the maxchan 7 loop. The function has
// already restricted the value to 1, 5 or 7 before this point.
//   shift, index_mask, mask_minus1  passed through to implement_rematrix
546.macro switch_on_maxchan  shift, index_mask, mask_minus1
547        cmp     v4, #5
548        blo     51f
549        beq     50f
550        implement_rematrix  \shift, \index_mask, \mask_minus1, 7
55150:     implement_rematrix  \shift, \index_mask, \mask_minus1, 5
55251:     implement_rematrix  \shift, \index_mask, \mask_minus1, 1
553.endm
554
// Branch on the value in sl (mask as loaded by ff_mlp_rematrix_channel_arm):
// a mask of -1 selects the variants that skip the AND, any other value the
// variants that apply it.
//   shift, index_mask  passed through to switch_on_maxchan
555.macro switch_on_mask  shift, index_mask
556        cmp     sl, #-1
557        bne     40f
558        switch_on_maxchan  \shift, \index_mask, 1
55940:     switch_on_maxchan  \shift, \index_mask, 0
560.endm
561
// Select the noise index width from the value in v6 (access_unit_size_pow2)
// and place the initial index from v1 into the top 6 or 7 bits of INDEX.
// With a shift of 0 no noise is added, so no index is set up and the
// index_mask argument is only a placeholder that is never evaluated.
//   shift  matrix_noise_shift fixed for this expansion
562.macro switch_on_au_size  shift
563  .if \shift == 0
564        switch_on_mask  \shift, undefined
565  .else
566        teq     v6, #64
567        bne     30f
568        orr     INDEX, INDEX, v1, lsl #32-6
569        switch_on_mask  \shift, 63
57030:     orr     INDEX, INDEX, v1, lsl #32-7
571        switch_on_mask  \shift, 127
572  .endif
573.endm
574
575/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
576 *                                  const int32_t *coeffs,
577 *                                  const uint8_t *bypassed_lsbs,
578 *                                  const int8_t *noise_buffer,
579 *                                  int index,
580 *                                  unsigned int dest_ch,
581 *                                  uint16_t blockpos,
582 *                                  unsigned int maxchan,
583 *                                  int matrix_noise_shift,
584 *                                  int access_unit_size_pow2,
585 *                                  int32_t mask);
586 */
// Entry point. samples, coeffs, bypassed_lsbs and noise_buffer arrive in
// a1-a4; the remaining seven arguments are loaded from the stack. Only
// maxchan values 1, 5 and 7 and access unit sizes 64 and 128 are handled
// here; any other combination restores the registers and tail-branches to
// ff_mlp_rematrix_channel with the original arguments intact.
// Exit paths: label 98 drops the three pushed working words and returns,
// label 99 is the fallback.
587function ff_mlp_rematrix_channel_arm, export=1
588        push    {v1-fp,lr}
589        add     v1, sp, #9*4 // point at arguments on stack
        // v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan,
        // v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask
590        ldm     v1, {v1-sl}
        // Z stays set only if maxchan is 1, 5 or 7
591        teq     v4, #1
592        itt     ne
593        teqne   v4, #5
594        teqne   v4, #7
595        bne     99f
        // Z stays set only if access_unit_size_pow2 is 64 or 128
596        teq     v6, #64
597        it      ne
598        teqne   v6, #128
599        bne     99f
        // The loop advances the sample pointer by MAX_CHANNELS words before
        // it stores, so the destination channel index is biased to match.
600        sub     v2, v2, #MAX_CHANNELS
601        push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
        // a4 is free from here on and becomes INDEX: blockpos goes into
        // field i (bit 7 upwards).
602        movs    INDEX, v3, lsl #7
603        beq     98f                 // just in case, do nothing if blockpos = 0
604        subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
        // index2 = 2 * index + 1, placed in the low bits of INDEX
605        adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
606        orr     INDEX, INDEX, lr
607        // Switch on matrix_noise_shift: values 0 and 1 are
608        // disproportionately common so do those in a form the branch
609        // predictor can accelerate. Values can only go up to 15.
610        cmp     v5, #1
611        beq     11f
612        blo     10f
613A       ldr     v5,  [pc,  v5,  lsl #2]
614A       add     pc,  pc,  v5
615T       tbh     [pc, v5, lsl #1]
6160:
        // The first two table entries are placeholders: shifts 0 and 1 were
        // already dispatched by the compare above.
617        branch_pic_label          0,          0, (12f - 0b), (13f - 0b)
618        branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
619        branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
620        branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
62110:     switch_on_au_size  0
62211:     switch_on_au_size  1
62312:     switch_on_au_size  2
62413:     switch_on_au_size  3
62514:     switch_on_au_size  4
62615:     switch_on_au_size  5
62716:     switch_on_au_size  6
62817:     switch_on_au_size  7
62918:     switch_on_au_size  8
63019:     switch_on_au_size  9
63120:     switch_on_au_size  10
63221:     switch_on_au_size  11
63322:     switch_on_au_size  12
63423:     switch_on_au_size  13
63524:     switch_on_au_size  14
63625:     switch_on_au_size  15
637
        // Common exit: discard the three working words pushed above.
63898:     add     sp, sp, #3*4
639        pop     {v1-fp,pc}
64099:     // Can't handle these parameters, drop back to C
        // Nothing beyond the saved registers has been pushed on this path,
        // and a1-a4 are unmodified, so the C function sees the original call.
641        pop     {v1-fp,lr}
642        b       X(ff_mlp_rematrix_channel)
643endfunc
644
        // Release every register alias defined for the rematrix routine,
        // including the four second names given to SA0-SA3.
645        .unreq  PSA
646        .unreq  PCO
647        .unreq  PBL
648        .unreq  INDEX
649        .unreq  CO0
650        .unreq  CO1
651        .unreq  CO2
652        .unreq  CO3
653        .unreq  SA0
654        .unreq  SA1
655        .unreq  SA2
656        .unreq  SA3
657        .unreq  AC0
658        .unreq  AC1
659        .unreq  NOISE
660        .unreq  LSB
661        .unreq  DCH
662        .unreq  MASK
663