1/*****************************************************************************
2* Copyright (C) 2000-2001 Andre McCurdy  <armccurdy@yahoo.co.uk>
3*
4* This program is free software; you can redistribute it and/or modify
5* it under the terms of the GNU General Public License as published by
6* the Free Software Foundation; either version 2 of the License, or
7* (at your option) any later version.
8*
9* This program is distributed in the hope that it will be useful,
10* but WITHOUT ANY WARRANTY; without even the implied warranty of
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12* GNU General Public License for more details.
13*
14* You should have received a copy of the GNU General Public License
15* along with this program; if not, write to the Free Software
16* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17*
18*****************************************************************************
19*
20* Notes:
21*
22*
23*****************************************************************************
24*
25* $Id: imdct_l_arm.S,v 1.7 2001/03/25 20:03:34 rob Rel $
26*
27* 2001/03/24:  Andre McCurdy <armccurdy@yahoo.co.uk>
28*   - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
29*
30* 2000/09/20:  Robert Leslie <rob@mars.org>
31*   - Added a global symbol with leading underscore per suggestion of
32*     Simon Burge to support linking with the a.out format.
33*
34* 2000/09/15:  Robert Leslie <rob@mars.org>
35*   - Fixed a small bug where flags were changed before a conditional branch.
36*
37* 2000/09/15:  Andre McCurdy <armccurdy@yahoo.co.uk>
38*   - Applied Nicolas Pitre's rounding optimisation in all remaining places.
39*
40* 2000/09/09:  Nicolas Pitre <nico@cam.org>
41*   - Optimized rounding + scaling operations.
42*
43* 2000/08/09:  Andre McCurdy <armccurdy@yahoo.co.uk>
44*   - Original created.
45*
46****************************************************************************/
47
48
49/*
50   On entry:
51
52      r0 = pointer to 18 element input  array
53      r1 = pointer to 36 element output array
54      r2 = windowing block type
55
56
57   Stack frame created during execution of the function:
58
59   Initial   Holds:
60   Stack
61   pointer
62   minus:
63
64       0
65       4     lr
66       8     r11
67      12     r10
68      16     r9
69      20     r8
70      24     r7
71      28     r6
72      32     r5
73      36     r4
74
75      40     r2 : windowing block type
76
77      44     ct00 high
78      48     ct00 low
79      52     ct01 high
80      56     ct01 low
81      60     ct04 high
82      64     ct04 low
83      68     ct06 high
84      72     ct06 low
85      76     ct05 high
86      80     ct05 low
87      84     ct03 high
88      88     ct03 low
89      92    -ct05 high
90      96    -ct05 low
91     100    -ct07 high
92     104    -ct07 low
93     108     ct07 high
94     112     ct07 low
95     116     ct02 high
96     120     ct02 low
97*/
98
/* Windowing block type values (the third argument, passed in r2).  The
   windowing code compares the saved argument against BLOCK_MODE_STOP and
   BLOCK_MODE_START; every other value takes the normal block path. */
99#define BLOCK_MODE_NORMAL   0
100#define BLOCK_MODE_START    1
101#define BLOCK_MODE_STOP     3
102
103
/* Byte offsets of the 32-bit words X[0]..X[17] of the input array
   (used as load offsets from r0). */
104#define X0   0x00
105#define X1   0x04
106#define X2   0x08
107#define X3   0x0C
108#define X4   0x10
109#define X5   0x14
110#define X6   0x18
111#define X7   0x1c
112#define X8   0x20
113#define X9   0x24
114#define X10  0x28
115#define X11  0x2c
116#define X12  0x30
117#define X13  0x34
118#define X14  0x38
119#define X15  0x3c
120#define X16  0x40
121#define X17  0x44
122
/* Byte offsets of the 32-bit words x[0]..x[35] of the output array
   (used as store offsets from r1). */
123#define x0   0x00
124#define x1   0x04
125#define x2   0x08
126#define x3   0x0C
127#define x4   0x10
128#define x5   0x14
129#define x6   0x18
130#define x7   0x1c
131#define x8   0x20
132#define x9   0x24
133#define x10  0x28
134#define x11  0x2c
135#define x12  0x30
136#define x13  0x34
137#define x14  0x38
138#define x15  0x3c
139#define x16  0x40
140#define x17  0x44
141#define x18  0x48
142#define x19  0x4c
143#define x20  0x50
144#define x21  0x54
145#define x22  0x58
146#define x23  0x5c
147#define x24  0x60
148#define x25  0x64
149#define x26  0x68
150#define x27  0x6c
151#define x28  0x70
152#define x29  0x74
153#define x30  0x78
154#define x31  0x7c
155#define x32  0x80
156#define x33  0x84
157#define x34  0x88
158#define x35  0x8c
159
/* Multiplier constants for the IMDCT stage.  Products with these values are
   accumulated in 64 bits and bits [59..28] of the sum are kept.
   NOTE(review): that scaling suggests signed fixed point with 28 fractional
   bits - confirm against the C reference implementation. */
160#define K00  0x0ffc19fd
161#define K01  0x00b2aa3e
162#define K02  0x0fdcf549
163#define K03  0x0216a2a2
164#define K04  0x0f9ee890
165#define K05  0x03768962
166#define K06  0x0f426cb5
167#define K07  0x04cfb0e2
168#define K08  0x0ec835e8
169#define K09  0x061f78aa
170#define K10  0x0e313245
171#define K11  0x07635284
172#define K12  0x0d7e8807
173#define K13  0x0898c779
174#define K14  0x0cb19346
175#define K15  0x09bd7ca0
176#define K16  0x0bcbe352
177#define K17  0x0acf37ad
178
/* 32-bit negation of K02 (0 - 0x0fdcf549), so that -K02 can be loaded
   directly as a literal. */
179#define minus_K02 0xf0230ab7
180
/* Window multipliers; the windowing code refers to WLn as window_l[n].
   Every value here equals one of the K constants above (for example
   WL0 == K01, WL8 == K17, WL9 == K16 and WL17 == K00). */
181#define WL0  0x00b2aa3e
182#define WL1  0x0216a2a2
183#define WL2  0x03768962
184#define WL3  0x04cfb0e2
185#define WL4  0x061f78aa
186#define WL5  0x07635284
187#define WL6  0x0898c779
188#define WL7  0x09bd7ca0
189#define WL8  0x0acf37ad
190#define WL9  0x0bcbe352
191#define WL10 0x0cb19346
192#define WL11 0x0d7e8807
193#define WL12 0x0e313245
194#define WL13 0x0ec835e8
195#define WL14 0x0f426cb5
196#define WL15 0x0f9ee890
197#define WL16 0x0fdcf549
198#define WL17 0x0ffc19fd
199
200
201@*****************************************************************************
202
203
204    .text
205    .align
206
207    .global III_imdct_l
208    .global _III_imdct_l
209
@ III_imdct_l (also exported with a leading underscore as _III_imdct_l)
@
@ On entry: r0 = pointer to input array X[0..17]
@           r1 = pointer to output array x[0..35]
@           r2 = windowing block type
@
@ This first section saves the callee saved registers plus r2, then builds
@ the 64-bit sums ct01, ct00, ct06 and ct04 and pushes them on the stack.
@ In every 64-bit register pair written as rA..rB, rA holds the low word.
210III_imdct_l:
211_III_imdct_l:
212
213    stmdb   sp!, { r2, r4 - r11, lr }   @ all callee saved regs, plus arg3 (r2 ends up at the lowest address)
214
215    ldr     r4, =K08                    @ r4 =  K08
216    ldr     r5, =K09                    @ r5 =  K09
217    ldr     r8, [r0, #X4]               @ r8 =  X4
218    ldr     r9, [r0, #X13]              @ r9 =  X13
219    rsb     r6, r4, #0                  @ r6 = -K08
220    rsb     r7, r5, #0                  @ r7 = -K09
221
222    smull   r2, r3, r4, r8              @ r2..r3  = (X4 * K08)
223    smlal   r2, r3, r5, r9              @ r2..r3  = (X4 * K08) + (X13 *  K09) = ct01
224
225    smull   r10, lr, r8, r5             @ r10..lr = (X4 * K09)
226    smlal   r10, lr, r9, r6             @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
227
228    ldr     r8, [r0, #X7]               @ r8 = X7
229    ldr     r9, [r0, #X16]              @ r9 = X16
230
231    stmdb   sp!, { r2, r3, r10, lr }    @ stack ct00_h, ct00_l, ct01_h, ct01_l (listed from highest address down)
232
233    add     r8, r8, r9                  @ r8 = (X7 + X16)
234    ldr     r9, [r0, #X1]               @ r9 = X1
235
236    smlal   r2, r3, r6, r8              @ r2..r3  = ct01 + ((X7 + X16) * -K08)
237    smlal   r2, r3, r7, r9              @ r2..r3 += (X1  * -K09)
238
239    ldr     r7, [r0, #X10]              @ r7 = X10
240
241    rsbs    r10, r10, #0                @ 64-bit negate, low word first (sets carry for the rsc)
242    rsc     lr, lr, #0                  @ r10..lr  = -ct00
243
244    smlal   r2, r3, r5, r7              @ r2..r3  += (X10 *  K09) = ct06
245
246    smlal   r10, lr, r9, r6             @ r10..lr  = -ct00 + ( X1        * -K08)
247    smlal   r10, lr, r8, r5             @ r10..lr +=         ((X7 + X16) *  K09)
248    smlal   r10, lr, r7, r4             @ r10..lr +=         ( X10       *  K08) = ct04
249
250    stmdb   sp!, { r2, r3, r10, lr }    @ stack ct04_h, ct04_l, ct06_h, ct06_l (listed from highest address down)
251
252    @----
@ Form the input combinations ct14..ct19 and finish output words x22 and x4.
@ On entry r2..r3 = ct06, r10..lr = ct04, r4 = K08, r5 = K09, r6 = -K08.
@ ct14 (r7), ct15 (r6), ct16 (r8) and ct17 (r9) stay live for the next two
@ sections.
253
254    ldr     r7, [r0, #X0]
255    ldr     r8, [r0, #X11]
256    ldr     r9, [r0, #X12]
257    sub     r7, r7, r8
258    sub     r7, r7, r9                  @ r7 = (X0 - X11 -X12) = ct14
259
260    ldr     r9,  [r0, #X3]
261    ldr     r8,  [r0, #X8]
262    ldr     r11, [r0, #X15]
263    sub     r8, r8, r9
264    add     r8, r8, r11                 @ r8 = (X8 - X3 + X15) = ct16
265
266    add     r11, r7, r8                 @ r11 = ct14 + ct16 = ct18
267
268    smlal   r2, r3, r6, r11             @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
269
270    ldr     r6,  [r0, #X2]              @ r6 (-K08) is no longer needed and is reused
271    ldr     r9,  [r0, #X9]
272    ldr     r12, [r0, #X14]
273    sub     r6, r6, r9
274    sub     r6, r6, r12                 @ r6 = (X2 - X9 - X14) = ct15
275
276    ldr     r9,  [r0, #X5]
277    ldr     r12, [r0, #X6]
278    sub     r9, r9, r12
279    ldr     r12, [r0, #X17]
280    sub     r9, r9, r12                 @ r9 = (X5 - X6 - X17) = ct17
281
282    add     r12, r9, r6                 @ r12 = ct15 + ct17 = ct19
283
284    smlal   r2, r3, r5, r12             @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
285
286    smlal   r10, lr, r11, r5            @ r10..lr = ct04 + (ct18 * K09)
287    smlal   r10, lr, r12, r4            @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
288
289    movs    r2, r2, lsr #28             @ low word >> 28; carry = bit 27, the last bit shifted out
290    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3 (plus carry, i.e. rounded)
291    str     r2, [r1, #x22]              @ store result x22
292
293    movs    r10, r10, lsr #28           @ low word >> 28; carry = bit 27
294    adc     r10, r10, lr, lsl #4        @ r10 = bits[59..28] of r10..lr (plus carry, i.e. rounded)
295    str     r10, [r1, #x4]              @ store result x4
296
297    @----
@ Output words x7 and x1: start again from the stacked ct06 / ct04 and add
@ products of ct14..ct17 with K03, K15, K14 and K02.
298
299    ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
300
301    @ r2..r3 = ct06
302    @ r4..r5 = ct04
303    @ r6     = ct15
304    @ r7     = ct14
305    @ r8     = ct16
306    @ r9     = ct17
307    @ r10    = .
308    @ r11    = .
309    @ r12    = .
310    @ lr     = .
311
312    ldr     r10, =K03                   @ r10 = K03
313    ldr     lr,  =K15                   @ lr  = K15
314
315    smlal   r2, r3, r10, r7             @ r2..r3 = ct06 + (ct14 * K03)
316    smlal   r4, r5,  lr, r7             @ r4..r5 = ct04 + (ct14 * K15)
317
318    ldr     r12, =K14                   @ r12 =  K14
319    rsb     r10, r10, #0                @ r10 = -K03
320
321    smlal   r2, r3,  lr, r6             @ r2..r3 += (ct15 *  K15)
322    smlal   r4, r5, r10, r6             @ r4..r5 += (ct15 * -K03)
323    smlal   r2, r3, r12, r8             @ r2..r3 += (ct16 *  K14)
324
325    ldr     r11, =minus_K02             @ r11 = -K02
326    rsb     r12, r12, #0                @ r12 = -K14
327
328    smlal   r4, r5, r12, r9             @ r4..r5 += (ct17 * -K14)
329    smlal   r2, r3, r11, r9             @ r2..r3 += (ct17 * -K02)
330    smlal   r4, r5, r11, r8             @ r4..r5 += (ct16 * -K02)
331
332    movs    r2, r2, lsr #28             @ carry = bit 27 of the low word
333    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3 (plus carry, i.e. rounded)
334    str     r2, [r1, #x7]               @ store result x7
335
336    movs    r4, r4, lsr #28             @ carry = bit 27 of the low word
337    adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5 (plus carry, i.e. rounded)
338    str     r4, [r1, #x1]               @ store result x1
339
340    @----
@ Output words x25 and x19: reload ct06 / ct04 once more, negate ct06, and
@ reuse the constants left in r10, r11, r12 and lr by the previous section.
@ On exit r10 = -K03, r11 = K02, r12 = -K14, lr = -K15.
341
342    ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
343
344    @ r2..r3 = ct06
345    @ r4..r5 = ct04
346    @ r6     = ct15
347    @ r7     = ct14
348    @ r8     = ct16
349    @ r9     = ct17
350    @ r10    = -K03
351    @ r11    = -K02
352    @ r12    = -K14
353    @ lr     =  K15
354
355    rsbs    r2, r2, #0                  @ 64-bit negate, low word first (sets carry for the rsc)
356    rsc     r3, r3, #0                  @ r2..r3 = -ct06
357
358    smlal   r2, r3, r12, r7             @ r2..r3  = -ct06 + (ct14 * -K14)
359    smlal   r2, r3, r10, r8             @ r2..r3 += (ct16 * -K03)
360
361    smlal   r4, r5, r12, r6             @ r4..r5  =  ct04 + (ct15 * -K14)
362    smlal   r4, r5, r10, r9             @ r4..r5 += (ct17 * -K03)
363    smlal   r4, r5,  lr, r8             @ r4..r5 += (ct16 *  K15)
364    smlal   r4, r5, r11, r7             @ r4..r5 += (ct14 * -K02)
365
366    rsb     lr, lr, #0                  @ lr  = -K15
367    rsb     r11, r11, #0                @ r11 =  K02
368
369    smlal   r2, r3,  lr, r9             @ r2..r3 += (ct17 * -K15)
370    smlal   r2, r3, r11, r6             @ r2..r3 += (ct15 *  K02)
371
372    movs    r4, r4, lsr #28             @ carry = bit 27 of the low word
373    adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5 (plus carry, i.e. rounded)
374    str     r4, [r1, #x25]              @ store result x25
375
376    movs    r2, r2, lsr #28             @ carry = bit 27 of the low word
377    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3 (plus carry, i.e. rounded)
378    str     r2, [r1, #x19]              @ store result x19
379
380    @----
@ Build ct05, ct03, ct07 and ct02 from the stacked ct01 / ct00 and from
@ X1, X7, X10 and X16, then push them (together with -ct05 and -ct07) so
@ that the table driven loop can fetch any of them by its offset from sp.
381
382    ldr     r2, [sp, #16]               @ r2 = ct01_l
383    ldr     r3, [sp, #20]               @ r3 = ct01_h
384
385    ldr     r6, [r0, #X1]
386    ldr     r8, [r0, #X7]
387    ldr     r9, [r0, #X10]
388    ldr     r7, [r0, #X16]
389
390    rsbs    r2, r2, #0                  @ 64-bit negate, low word first (sets carry for the rsc)
391    rsc     r3, r3, #0                  @ r2..r3 = -ct01
392
393    mov     r4, r2
394    mov     r5, r3                      @ r4..r5 = -ct01
395
396    @ r2..r3 = -ct01
397    @ r4..r5 = -ct01
398    @ r6     =  X1
399    @ r7     =  X16
400    @ r8     =  X7
401    @ r9     =  X10
402    @ r10    = -K03
403    @ r11    =  K02
404    @ r12    = -K14
405    @ lr     = -K15
406
407    smlal   r4, r5, r12, r7             @ r4..r5 = -ct01 + (X16 * -K14)
408    smlal   r2, r3,  lr, r9             @ r2..r3 = -ct01 + (X10 * -K15)
409
410    smlal   r4, r5, r10, r8             @ r4..r5 += (X7  * -K03)
411    smlal   r2, r3, r10, r7             @ r2..r3 += (X16 * -K03)
412
413    smlal   r4, r5, r11, r9             @ r4..r5 += (X10 *  K02)
414    smlal   r2, r3, r12, r8             @ r2..r3 += (X7  * -K14)
415
416    rsb     lr, lr, #0                  @ lr  =  K15
417    rsb     r11, r11, #0                @ r11 = -K02
418
419    smlal   r4, r5,  lr, r6             @ r4..r5 += (X1  *  K15) = ct05
420    smlal   r2, r3, r11, r6             @ r2..r3 += (X1  * -K02) = ct03
421
422    stmdb   sp!, { r2, r3, r4, r5 }     @ stack ct05_h, ct05_l, ct03_h, ct03_l (listed from highest address down)
423
424    rsbs    r4, r4, #0                  @ 64-bit negate, low word first (sets carry for the rsc)
425    rsc     r5, r5, #0                  @ r4..r5 = -ct05
426
427    stmdb   sp!, { r4, r5 }             @ stack -ct05_h, -ct05_l
428
429    ldr     r2, [sp, #48]               @ r2 = ct00_l
430    ldr     r3, [sp, #52]               @ r3 = ct00_h
431
432    rsb     r10, r10, #0                @ r10 = K03
433
434    rsbs    r4, r2, #0                  @ negated copy; r2..r3 keep ct00
435    rsc     r5, r3, #0                  @ r4..r5 = -ct00
436
437    @ r2..r3 =  ct00
438    @ r4..r5 = -ct00
439    @ r6     =  X1
440    @ r7     =  X16
441    @ r8     =  X7
442    @ r9     =  X10
443    @ r10    =  K03
444    @ r11    = -K02
445    @ r12    = -K14
446    @ lr     =  K15
447
448    smlal   r4, r5, r10, r6             @ r4..r5 = -ct00 + (X1  * K03)
449    smlal   r2, r3, r10, r9             @ r2..r3 =  ct00 + (X10 * K03)
450
451    smlal   r4, r5, r12, r9             @ r4..r5 += (X10 * -K14)
452    smlal   r2, r3, r12, r6             @ r2..r3 += (X1  * -K14)
453
454    smlal   r4, r5, r11, r7             @ r4..r5 += (X16 * -K02)
455    smlal   r4, r5,  lr, r8             @ r4..r5 += (X7  *  K15) = ct07
456
457    rsb     lr, lr, #0                  @ lr  = -K15
458    rsb     r11, r11, #0                @ r11 =  K02
459
460    smlal   r2, r3, r11, r8             @ r2..r3 += (X7  *  K02)
461    smlal   r2, r3,  lr, r7             @ r2..r3 += (X16 * -K15) = ct02
462
463    rsbs    r6, r4, #0                  @ negated copy; r4..r5 keep ct07
464    rsc     r7, r5, #0                  @ r6..r7 = -ct07
465
466    stmdb   sp!, { r2 - r7 }            @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
467
468
469    @----
@ Table driven stage.  Each pass of the loop consumes one 13 word row of
@ imdct36_long_karray: 12 multipliers (applied in order to X0, X2, X3, X5,
@ X6, X8, X9, X11, X12, X14, X15, X17) followed by a control word:
@   bits 31..16 : byte offset from sp of the 64-bit ctxx value to add
@   bits 15..8  : byte offset in the output array where the result is stored
@   bits  7..0  : non zero only in the last row (completion flag)
470
471    add     r2, pc, #(imdct36_long_karray-.-8)  @ r2 = base address of Knn array (PIC safe ?); pc reads 8 bytes ahead, hence the -8
472
473
474loop:
475    ldr     r12, [r0, #X0]
476
477    ldmia   r2!, { r5 - r11 }           @ first 7 words from Karray element
478
479    smull   r3, r4, r5, r12             @ sum =  (Kxx * X0)
480    ldr     r12, [r0, #X2]
481    ldr     r5,  [r0, #X3]
482    smlal   r3, r4, r6, r12             @ sum += (Kxx * X2)
483    ldr     r12, [r0, #X5]
484    ldr     r6,  [r0, #X6]
485    smlal   r3, r4, r7, r5              @ sum += (Kxx * X3)
486    smlal   r3, r4, r8, r12             @ sum += (Kxx * X5)
487    ldr     r12, [r0, #X8]
488    ldr     r5,  [r0, #X9]
489    smlal   r3, r4, r9, r6              @ sum += (Kxx * X6)
490    smlal   r3, r4, r10, r12            @ sum += (Kxx * X8)
491    smlal   r3, r4, r11, r5             @ sum += (Kxx * X9)
492
493    ldmia   r2!, { r5 - r10 }           @ final 6 words from Karray element (r10 = control word)
494
495    ldr     r11, [r0, #X11]
496    ldr     r12, [r0, #X12]
497    smlal   r3, r4, r5, r11             @ sum += (Kxx * X11)
498    ldr     r11, [r0, #X14]
499    ldr     r5,  [r0, #X15]
500    smlal   r3, r4, r6, r12             @ sum += (Kxx * X12)
501    smlal   r3, r4, r7, r11             @ sum += (Kxx * X14)
502    ldr     r11, [r0, #X17]
503    smlal   r3, r4, r8, r5              @ sum += (Kxx * X15)
504    smlal   r3, r4, r9, r11             @ sum += (Kxx * X17)
505
506    add     r5, sp, r10, lsr #16        @ create index back into stack for required ctxx
507
508    ldmia   r5, { r6, r7 }              @ r6..r7 = ctxx
509
510    mov     r8, r10, lsl #16            @ push ctxx index off the top end
511
512    adds    r3, r3, r6                  @ add low words
513    adc     r4, r4, r7                  @ add high words, with carry
514    movs    r3, r3, lsr #28             @ carry = bit 27 of the low word
515    adc     r3, r3, r4, lsl #4          @ r3 = bits[59..28] of r3..r4 (plus carry, i.e. rounded)
516
517    str     r3, [r1, r8, lsr #24]       @ push completion flag off the bottom end; what remains is the output byte offset
518
519    movs    r8, r8, lsl #8              @ push result location index off the top end; Z is set unless the flag byte is non zero
520    beq     loop                        @ loop back if completion flag not set
521    b       imdct_l_windowing           @ branch to windowing stage if looping finished
522
@ Coefficient table read by the loop above, one row per loop pass.
@ Each row holds 12 multipliers for X0, X2, X3, X5, X6, X8, X9, X11, X12,
@ X14, X15 and X17 (in that order), then a control word made of
@ (ctxx stack byte offset << 16) | (output byte offset << 8) | last row flag.
@ The trailing comment on each row decodes its control word using the stack
@ frame layout documented at the top of the file.
523imdct36_long_karray:
524
525    .word   K17, -K13,  K10, -K06, -K05,  K01, -K00,  K04, -K07,  K11,  K12, -K16, 0x00000000   @ x0  = sum + ct02
526    .word   K13,  K07,  K16,  K01,  K10, -K05,  K04, -K11,  K00, -K17,  K06, -K12, 0x00200800   @ x2  = sum + ct03
527    .word   K11,  K17,  K05,  K12, -K01,  K06, -K07,  K00, -K13,  K04, -K16,  K10, 0x00200c00   @ x3  = sum + ct03
528    .word   K07,  K00, -K12,  K05, -K16, -K10,  K11, -K17,  K04,  K13,  K01,  K06, 0x00001400   @ x5  = sum + ct02
529    .word   K05,  K10, -K00, -K17,  K07, -K13,  K12,  K06, -K16,  K01, -K11, -K04, 0x00181800   @ x6  = sum - ct05
530    .word   K01,  K05, -K07, -K11,  K13,  K17, -K16, -K12,  K10,  K06, -K04, -K00, 0x00102000   @ x8  = sum - ct07
531    .word  -K16,  K12, -K11,  K07,  K04, -K00, -K01,  K05, -K06,  K10,  K13, -K17, 0x00284800   @ x18 = sum + ct05
532    .word  -K12,  K06,  K17, -K00, -K11,  K04,  K05, -K10,  K01,  K16, -K07, -K13, 0x00085000   @ x20 = sum + ct07
533    .word  -K10,  K16,  K04, -K13, -K00,  K07,  K06, -K01, -K12, -K05,  K17,  K11, 0x00105400   @ x21 = sum - ct07
534    .word  -K06, -K01,  K13,  K04,  K17, -K11, -K10, -K16, -K05,  K12,  K00,  K07, 0x00185c00   @ x23 = sum - ct05
535    .word  -K04, -K11, -K01,  K16,  K06,  K12,  K13, -K07, -K17, -K00, -K10, -K05, 0x00006000   @ x24 = sum + ct02
536    .word  -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801   @ x26 = sum + ct03, last row
537
538
539    @----
540    @-------------------------------------------------------------------------
541    @----
@ Windowing stage entry.  Reloads the block type saved by the function
@ prologue, loads x[0]..x[8] and dispatches: stop blocks branch to
@ stop_block_x0_to_x17, every other block type falls through to
@ normal_block_x0_to_x17.  r0 no longer holds the input pointer from here on.
542
543imdct_l_windowing:
544
545    ldr     r11, [sp, #80]              @ fetch function parameter 3 from out of the stack (20 words of ctxx values lie below it)
546    ldmia   r1!, { r0, r2 - r9 }        @ load 9 words from x0, update pointer
547
548    @ r0     = x0
549    @ r1     = &x[9]
550    @ r2     = x1
551    @ r3     = x2
552    @ r4     = x3
553    @ r5     = x4
554    @ r6     = x5
555    @ r7     = x6
556    @ r8     = x7
557    @ r9     = x8
558    @ r10    = .
559    @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
560    @ r12    = .
561    @ lr     = .
562
563    cmp     r11, #BLOCK_MODE_STOP       @ setup flags
564    rsb     r10, r0, #0                 @ r10 = -x0 (DONT change flags !!)
565    beq     stop_block_x0_to_x17
566
567
568    @ start and normal blocks are treated the same for x[0]..x[17]
@ First x[9]..x[17] are produced as window_l[9+i] * -x[8-i] (i = 0..8) and
@ stored, then x[0]..x[8] are reloaded and replaced by window_l[i] * x[i].
@ Every product is reduced to bits [59..28] with the carry from bit 27 added.
@ On entry the registers are as left by imdct_l_windowing (r10 = -x0).
569
570normal_block_x0_to_x17:
571
572    ldr     r12, =WL9                   @ r12 = window_l[9]
573
574    rsb     r0,  r9, #0                 @ r0  = -x8
575    rsb     r9,  r2, #0                 @ r9  = -x1
576    rsb     r2,  r8, #0                 @ r2  = -x7
577    rsb     r8,  r3, #0                 @ r8  = -x2
578    rsb     r3,  r7, #0                 @ r3  = -x6
579    rsb     r7,  r4, #0                 @ r7  = -x3
580    rsb     r4,  r6, #0                 @ r4  = -x5
581    rsb     r6,  r5, #0                 @ r6  = -x4
582
583    @ r0     = -x8
584    @ r1     = &x[9]
585    @ r2     = -x7
586    @ r3     = -x6
587    @ r4     = -x5
588    @ r5     = .
589    @ r6     = -x4
590    @ r7     = -x3
591    @ r8     = -x2
592    @ r9     = -x1
593    @ r10    = -x0
594    @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
595    @ r12    = window_l[9]
596    @ lr     = .
597
598    smull   r5, lr, r12, r0             @ r5..lr = (window_l[9]  * (x[9]  == -x[8]))
599    ldr     r12, =WL10                  @ r12 = window_l[10]
600    movs    r5, r5, lsr #28
601    adc     r0, r5, lr, lsl #4          @ r0 = bits[59..28] of windowed x9
602
603    smull   r5, lr, r12, r2             @ r5..lr = (window_l[10] * (x[10] == -x[7]))
604    ldr     r12, =WL11                  @ r12 = window_l[11]
605    movs    r5, r5, lsr #28
606    adc     r2, r5, lr, lsl #4          @ r2 = bits[59..28] of windowed x10
607
608    smull   r5, lr, r12, r3             @ r5..lr = (window_l[11] * (x[11] == -x[6]))
609    ldr     r12, =WL12                  @ r12 = window_l[12]
610    movs    r5, r5, lsr #28
611    adc     r3, r5, lr, lsl #4          @ r3 = bits[59..28] of windowed x11
612
613    smull   r5, lr, r12, r4             @ r5..lr = (window_l[12] * (x[12] == -x[5]))
614    ldr     r12, =WL13                  @ r12 = window_l[13]
615    movs    r5, r5, lsr #28
616    adc     r4, r5, lr, lsl #4          @ r4 = bits[59..28] of windowed x12
617
618    smull   r5, lr, r12, r6             @ r5..lr = (window_l[13] * (x[13] == -x[4]))
619    ldr     r12, =WL14                  @ r12 = window_l[14]
620    movs    r5, r5, lsr #28
621    adc     r6, r5, lr, lsl #4          @ r6 = bits[59..28] of windowed x13
622
623    smull   r5, lr, r12, r7             @ r5..lr = (window_l[14] * (x[14] == -x[3]))
624    ldr     r12, =WL15                  @ r12 = window_l[15]
625    movs    r5, r5, lsr #28
626    adc     r7, r5, lr, lsl #4          @ r7 = bits[59..28] of windowed x14
627
628    smull   r5, lr, r12, r8             @ r5..lr = (window_l[15] * (x[15] == -x[2]))
629    ldr     r12, =WL16                  @ r12 = window_l[16]
630    movs    r5, r5, lsr #28
631    adc     r8, r5, lr, lsl #4          @ r8 = bits[59..28] of windowed x15
632
633    smull   r5, lr, r12, r9             @ r5..lr = (window_l[16] * (x[16] == -x[1]))
634    ldr     r12, =WL17                  @ r12 = window_l[17]
635    movs    r5, r5, lsr #28
636    adc     r9, r5, lr, lsl #4          @ r9 = bits[59..28] of windowed x16
637
638    smull   r5, lr, r12, r10            @ r5..lr = (window_l[17] * (x[17] == -x[0]))
639    ldr     r12, =WL0                   @ r12 = window_l[0]
640    movs    r5, r5, lsr #28
641    adc     r10, r5, lr, lsl #4         @ r10 = bits[59..28] of windowed x17
642
643
644    stmia   r1,  { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
645    ldmdb   r1!, { r0, r2 - r9 }           @ load 9 words downto (and including) x0; r1 = &x[0] afterwards
646
647
648    smull   r10, lr, r12, r0            @ r10..lr = (window_l[0] * x[0])
649    ldr     r12, =WL1                   @ r12 = window_l[1]
650    movs    r10, r10, lsr #28
651    adc     r0, r10, lr, lsl #4         @ r0 = bits[59..28] of windowed x0
652
653    smull   r10, lr, r12, r2            @ r10..lr = (window_l[1] * x[1])
654    ldr     r12, =WL2                   @ r12 = window_l[2]
655    movs    r10, r10, lsr #28
656    adc     r2, r10, lr, lsl #4         @ r2 = bits[59..28] of windowed x1
657
658    smull   r10, lr, r12, r3            @ r10..lr = (window_l[2] * x[2])
659    ldr     r12, =WL3                   @ r12 = window_l[3]
660    movs    r10, r10, lsr #28
661    adc     r3, r10, lr, lsl #4         @ r3 = bits[59..28] of windowed x2
662
663    smull   r10, lr, r12, r4            @ r10..lr = (window_l[3] * x[3])
664    ldr     r12, =WL4                   @ r12 = window_l[4]
665    movs    r10, r10, lsr #28
666    adc     r4, r10, lr, lsl #4         @ r4 = bits[59..28] of windowed x3
667
668    smull   r10, lr, r12, r5            @ r10..lr = (window_l[4] * x[4])
669    ldr     r12, =WL5                   @ r12 = window_l[5]
670    movs    r10, r10, lsr #28
671    adc     r5, r10, lr, lsl #4         @ r5 = bits[59..28] of windowed x4
672
673    smull   r10, lr, r12, r6            @ r10..lr = (window_l[5] * x[5])
674    ldr     r12, =WL6                   @ r12 = window_l[6]
675    movs    r10, r10, lsr #28
676    adc     r6, r10, lr, lsl #4         @ r6 = bits[59..28] of windowed x5
677
678    smull   r10, lr, r12, r7            @ r10..lr = (window_l[6] * x[6])
679    ldr     r12, =WL7                   @ r12 = window_l[7]
680    movs    r10, r10, lsr #28
681    adc     r7, r10, lr, lsl #4         @ r7 = bits[59..28] of windowed x6
682
683    smull   r10, lr, r12, r8            @ r10..lr = (window_l[7] * x[7])
684    ldr     r12, =WL8                   @ r12 = window_l[8]
685    movs    r10, r10, lsr #28
686    adc     r8, r10, lr, lsl #4         @ r8 = bits[59..28] of windowed x7
687
688    smull   r10, lr, r12, r9            @ r10..lr = (window_l[8] * x[8])
689    movs    r10, r10, lsr #28
690    adc     r9, r10, lr, lsl #4         @ r9 = bits[59..28] of windowed x8
691
692    stmia   r1, { r0, r2 - r9 }         @ store windowed x[0] .. x[8]
693
694    cmp     r11, #BLOCK_MODE_START      @ start blocks handle x[18]..x[35] separately
695    beq     start_block_x18_to_x35
696
697
698    @----
@ Second half of the output for blocks that are not start blocks.
@ On entry r1 = &x[0].  x[27]..x[35] are produced first as
@ window_l[i] * x[26-i] (i = 0..8, stored at x[35-i]), then x[18]..x[26]
@ are reloaded and replaced by window_l[17-i] * x[18+i].
@ Ends by dropping the 21 word working frame and returning.
699
700
701normal_block_x18_to_x35:
702
703    ldr     r11, =WL3                   @ r11 = window_l[3]
704    ldr     r12, =WL4                   @ r12 = window_l[4]
705
706    add     r1, r1, #(18*4)             @ r1 = &x[18]
707
708    ldmia   r1!, { r0, r2 - r4, r6 - r10 }  @ load 9 words from x18, update pointer
709
710    @ r0     = x18
711    @ r1     = &x[27]
712    @ r2     = x19
713    @ r3     = x20
714    @ r4     = x21
715    @ r5     = .
716    @ r6     = x22
717    @ r7     = x23
718    @ r8     = x24
719    @ r9     = x25
720    @ r10    = x26
721    @ r11    = window_l[3]
722    @ r12    = window_l[4]
723    @ lr     = .
724
725    smull   r5, lr, r12, r6             @ r5..lr = (window_l[4] * (x[22] == x[31]))
726    movs    r5, r5, lsr #28
727    adc     r5, r5, lr, lsl #4          @ r5 = bits[59..28] of windowed x31
728
729    smull   r6, lr, r11, r4             @ r6..lr = (window_l[3] * (x[21] == x[32]))
730    ldr     r12, =WL5                   @ r12    =  window_l[5]
731    movs    r6, r6, lsr #28
732    adc     r6, r6, lr, lsl #4          @ r6 = bits[59..28] of windowed x32
733
734    smull   r4, lr, r12, r7             @ r4..lr = (window_l[5] * (x[23] == x[30]))
735    ldr     r11, =WL1                   @ r11    =  window_l[1]
736    ldr     r12, =WL2                   @ r12    =  window_l[2]
737    movs    r4, r4, lsr #28
738    adc     r4, r4, lr, lsl #4          @ r4 = bits[59..28] of windowed x30
739
740    smull   r7, lr, r12, r3             @ r7..lr = (window_l[2] * (x[20] == x[33]))
741    ldr     r12, =WL6                   @ r12 = window_l[6]
742    movs    r7, r7, lsr #28
743    adc     r7, r7, lr, lsl #4          @ r7 = bits[59..28] of windowed x33
744
745    smull   r3, lr, r12, r8             @ r3..lr = (window_l[6] * (x[24] == x[29]))
746    movs    r3, r3, lsr #28
747    adc     r3, r3, lr, lsl #4          @ r3 = bits[59..28] of windowed x29
748
749    smull   r8, lr, r11, r2             @ r8..lr = (window_l[1] * (x[19] == x[34]))
750    ldr     r12, =WL7                   @ r12    =  window_l[7]
751    ldr     r11, =WL8                   @ r11    =  window_l[8]
752    movs    r8, r8, lsr #28
753    adc     r8, r8, lr, lsl #4          @ r8 = bits[59..28] of windowed x34
754
755    smull   r2, lr, r12, r9             @ r2..lr = (window_l[7] * (x[25] == x[28]))
756    ldr     r12, =WL0                   @ r12 = window_l[0]
757    movs    r2, r2, lsr #28
758    adc     r2, r2, lr, lsl #4          @ r2 = bits[59..28] of windowed x28
759
760    smull   r9, lr, r12, r0             @ r9..lr = (window_l[0] * (x[18] == x[35]))
761    movs    r9, r9, lsr #28
762    adc     r9, r9, lr, lsl #4          @ r9 = bits[59..28] of windowed x35
763
764    smull   r0, lr, r11, r10            @ r0..lr = (window_l[8] * (x[26] == x[27]))
765    ldr     r11, =WL16                  @ r11    =  window_l[16]
766    ldr     r12, =WL17                  @ r12    =  window_l[17]
767    movs    r0, r0, lsr #28
768    adc     r0, r0, lr, lsl #4          @ r0 = bits[59..28] of windowed x27
769
770
771    stmia   r1,  { r0, r2 - r9 }        @ store windowed x[27] .. x[35]
772    ldmdb   r1!, { r0, r2 - r9 }        @ load 9 words downto (and including) x18
773
774
775    smull   r10, lr, r12, r0            @ r10..lr = (window_l[17] * x[18])
776    movs    r10, r10, lsr #28
777    adc     r0,  r10, lr, lsl #4        @ r0 = bits[59..28] of windowed x18
778
779    smull   r10, lr, r11, r2            @ r10..lr = (window_l[16] * x[19])
780    ldr     r11, =WL14                  @ r11     =  window_l[14]
781    ldr     r12, =WL15                  @ r12     =  window_l[15]
782    movs    r10, r10, lsr #28
783    adc     r2,  r10, lr, lsl #4        @ r2 = bits[59..28] of windowed x19
784
785    smull   r10, lr, r12, r3            @ r10..lr = (window_l[15] * x[20])
786    movs    r10, r10, lsr #28
787    adc     r3,  r10, lr, lsl #4        @ r3 = bits[59..28] of windowed x20
788
789    smull   r10, lr, r11, r4            @ r10..lr = (window_l[14] * x[21])
790    ldr     r11, =WL12                  @ r11     =  window_l[12]
791    ldr     r12, =WL13                  @ r12     =  window_l[13]
792    movs    r10, r10, lsr #28
793    adc     r4,  r10, lr, lsl #4        @ r4 = bits[59..28] of windowed x21
794
795    smull   r10, lr, r12, r5            @ r10..lr = (window_l[13] * x[22])
796    movs    r10, r10, lsr #28
797    adc     r5,  r10, lr, lsl #4        @ r5 = bits[59..28] of windowed x22
798
799    smull   r10, lr, r11, r6            @ r10..lr = (window_l[12] * x[23])
800    ldr     r11, =WL10                  @ r11 = window_l[10]
801    ldr     r12, =WL11                  @ r12 = window_l[11]
802    movs    r10, r10, lsr #28
803    adc     r6,  r10, lr, lsl #4        @ r6 = bits[59..28] of windowed x23
804
805    smull   r10, lr, r12, r7            @ r10..lr = (window_l[11] * x[24])
806    movs    r10, r10, lsr #28
807    adc     r7,  r10, lr, lsl #4        @ r7 = bits[59..28] of windowed x24
808
809    smull   r10, lr, r11, r8            @ r10..lr = (window_l[10] * x[25])
810    ldr     r12, =WL9                   @ r12 = window_l[9]
811    movs    r10, r10, lsr #28
812    adc     r8,  r10, lr, lsl #4        @ r8 = bits[59..28] of windowed x25
813
814    smull   r10, lr, r12, r9            @ r10..lr = (window_l[9] * x[26])
815
816    movs    r10, r10, lsr #28
817    adc     r9,  r10, lr, lsl #4        @ r9 = bits[59..28] of windowed x26
818
819    stmia   r1, { r0, r2 - r9 }         @ store windowed x[18] .. x[26]
820
821    @----
822    @ NB there are 2 possible exits from this function - this is only one of them
823    @----
824
825    add     sp, sp, #(21*4)             @ return stack frame (20 words of ctxx values plus the saved r2)
826    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return
827
828    @----
829
830
stop_block_x0_to_x17:

    @ Stop-block window for the first half of the output, x[0] .. x[17]:
    @
    @   x[0]  .. x[5]    written as zero
    @   x[6]  .. x[11]   scaled by window_s[0] .. window_s[5]
    @   x[12] .. x[17]   written unscaled
    @
    @ Only x0 .. x8 are held in registers. The upper nine values are formed
    @ by mirroring with a sign change, x[9 + i] = -x[8 - i], so no x[] value
    @ is loaded from memory in this block.
    @
    @ Live registers on entry:
    @
    @   r0          x0
    @   r1          &x[9]
    @   r2 .. r9    x1 .. x8
    @   r10         -x0
    @   r11         window mode: (0 == normal), (1 == start block), (3 == stop block)
    @   r12, lr     free

    @----
    @ x[12] .. x[17] = -x5 .. -x0, no windowing.
    @
    @ stmia writes its register list in ascending register number order, so
    @ the negated values are shuffled until r0, r2, r3, r5, r6, r10 hold
    @ -x5, -x4, -x3, -x2, -x1, -x0. Every rsb below reads a register that the
    @ following rsb overwrites, so these five must stay in this order.
    @----

    rsb     r0, r6, #0                  @ r0 = -x5    -> x[12]
    rsb     r6, r2, #0                  @ r6 = -x1    -> x[16]
    rsb     r2, r5, #0                  @ r2 = -x4    -> x[13]
    rsb     r5, r3, #0                  @ r5 = -x2    -> x[15]
    rsb     r3, r4, #0                  @ r3 = -x3    -> x[14]

    add     r1, r1, #(3*4)                      @ r1 = &x[9] + 3 words = &x[12]
    stmia   r1, { r0, r2, r3, r5, r6, r10 }     @ write x[12] .. x[17], r1 stays at &x[12]

    @----
    @ x[6] .. x[11], windowed.
    @
    @ x[9], x[10], x[11] start out as -x8, -x7, -x6. Each product below is
    @ 64 bits wide in r5 (low) and r6 (high). The movs shifts the low word
    @ right by 28 and leaves bit 27 in the carry flag, the adc then merges in
    @ the high word and adds that carry, giving bits[59..28] rounded up on
    @ bit 27. The next coefficient is fetched between the smull and the movs.
    @----

    ldr     r0, =WL1                    @ r0  = window_l[1] == window_s[0]

    rsb     lr,  r7, #0                 @ lr  = -x6   (x[11] before windowing)
    rsb     r12, r8, #0                 @ r12 = -x7   (x[10] before windowing)
    rsb     r10, r9, #0                 @ r10 = -x8   (x[9]  before windowing)

    smull   r5, r6, r0, r7              @ r5..r6 = window_s[0] * x[6]
    ldr     r2, =WL4                    @ r2  = window_l[4]  == window_s[1]
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     r7, r5, r6, lsl #4          @ r7  = bits[59..28] of windowed x6

    smull   r5, r6, r2, r8              @ r5..r6 = window_s[1] * x[7]
    ldr     r3, =WL7                    @ r3  = window_l[7]  == window_s[2]
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     r8, r5, r6, lsl #4          @ r8  = bits[59..28] of windowed x7

    smull   r5, r6, r3, r9              @ r5..r6 = window_s[2] * x[8]
    ldr     r4, =WL10                   @ r4  = window_l[10] == window_s[3]
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     r9, r5, r6, lsl #4          @ r9  = bits[59..28] of windowed x8

    smull   r5, r6, r4, r10             @ r5..r6 = window_s[3] * (x[9]  == -x[8])
    ldr     r0, =WL13                   @ r0  = window_l[13] == window_s[4]
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     r10, r5, r6, lsl #4         @ r10 = bits[59..28] of windowed x9

    smull   r5, r6, r0, r12             @ r5..r6 = window_s[4] * (x[10] == -x[7])
    ldr     r2, =WL16                   @ r2  = window_l[16] == window_s[5]
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     r12, r5, r6, lsl #4         @ r12 = bits[59..28] of windowed x10

    smull   r5, r6, r2, lr              @ r5..r6 = window_s[5] * (x[11] == -x[6])
    movs    r5, r5, lsr #28             @ carry = bit 27 of the product
    adc     lr, r5, r6, lsl #4          @ lr  = bits[59..28] of windowed x11

    stmdb   r1!, { r7 - r10, r12, lr }  @ write x[6] .. x[11], r1 = &x[6]

    @----
    @ x[0] .. x[5] = 0. The coefficient and product registers are finished
    @ with, so they are cleared and written out as one six word store.
    @----

    mov     r0, #0
    mov     r2, #0
    mov     r3, #0
    mov     r4, #0
    mov     r5, #0
    mov     r6, #0

    stmdb   r1!, { r0, r2 - r6 }        @ write x[0] .. x[5], r1 = &x[0]

    b       normal_block_x18_to_x35     @ continue with x[18] .. x[35] at that label
921
922
923    @----
924
925
start_block_x18_to_x35:

    @ Start-block window for the last twelve outputs:
    @
    @   x[24] .. x[29]   scaled by WL16, WL13, WL10, WL7, WL4, WL1
    @   x[30] .. x[35]   written as zero
    @
    @ Only x[24], x[25], x[26] are loaded. As the annotations on the first
    @ three products record, x[29], x[28], x[27] are equal to x[24], x[25],
    @ x[26], so each loaded value is multiplied by two different coefficients.
    @
    @ NOTE(review): r1 is expected to hold &x[0] on entry, as implied by the
    @ 24 word offset below - confirm against the code that branches here.

    add     r1, r1, #(24*4)             @ r1 = &x[24]
    ldmia   r1, { r0, r2, r3 }          @ r0, r2, r3 = x24, x25, x26 (r1 not updated)

    ldr     r4, =WL1                    @ r4 = window_l[1] == window_s[0]
    ldr     r5, =WL4                    @ r5 = window_l[4] == window_s[1]

    @----
    @ Each product is 64 bits wide in r10 (low) and r11 (high). The movs
    @ shifts the low word right by 28 and leaves bit 27 in the carry flag,
    @ the adc then merges in the high word and adds that carry, giving
    @ bits[59..28] rounded up on bit 27. The coefficient for a later product
    @ is fetched between each smull and its movs.
    @
    @ r10 and r11 are scratch from here on; r11 is reloaded by the epilogue.
    @----

    smull   r10, r11, r4, r0            @ r10..r11 = window_l[1] * (x[24] == x[29])
    ldr     r6, =WL7                    @ r6  = window_l[7]  == window_s[2]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     lr, r10, r11, lsl #4        @ lr  = bits[59..28] of windowed x29

    smull   r10, r11, r5, r2            @ r10..r11 = window_l[4] * (x[25] == x[28])
    ldr     r7, =WL10                   @ r7  = window_l[10] == window_s[3]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     r12, r10, r11, lsl #4       @ r12 = bits[59..28] of windowed x28

    smull   r10, r11, r6, r3            @ r10..r11 = window_l[7] * (x[26] == x[27])
    ldr     r8, =WL13                   @ r8  = window_l[13] == window_s[4]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     r4, r10, r11, lsl #4        @ r4  = bits[59..28] of windowed x27

    smull   r10, r11, r7, r3            @ r10..r11 = window_l[10] * x[26]
    ldr     r9, =WL16                   @ r9  = window_l[16] == window_s[5]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     r3, r10, r11, lsl #4        @ r3  = bits[59..28] of windowed x26

    smull   r10, r11, r8, r2            @ r10..r11 = window_l[13] * x[25]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     r2, r10, r11, lsl #4        @ r2  = bits[59..28] of windowed x25

    smull   r10, r11, r9, r0            @ r10..r11 = window_l[16] * x[24]
    movs    r10, r10, lsr #28           @ carry = bit 27 of the product
    adc     r0, r10, r11, lsl #4        @ r0  = bits[59..28] of windowed x24

    stmia   r1!, { r0, r2, r3, r4, r12, lr }    @ write x[24] .. x[29], r1 = &x[30]

    @----
    @ x[30] .. x[35] = 0. The coefficient registers r5 .. r9 and the product
    @ low word r10 are finished with, so they are cleared and written out as
    @ one six word store.
    @----

    mov     r5, #0
    mov     r6, #0
    mov     r7, #0
    mov     r8, #0
    mov     r9, #0
    mov     r10, #0

    stmia   r1!, { r5 - r10 }           @ write x[30] .. x[35], r1 = &x[36]

    @----
    @ NB there are 2 possible exits from this function - this is only one of them
    @----

    add     sp, sp, #(21*4)             @ release the 21 word stack frame
    ldmia   sp!, { r4 - r11, pc }       @ reload r4 .. r11 and return by loading pc
996
997    @----
998    @END
999    @----
1000
1001