1@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2@@                           **** WAVPACK ****                            @@
3@@                  Hybrid Lossless Wavefile Compressor                   @@
4@@                Copyright (c) 1998 - 2019 David Bryant.                 @@
5@@                          All Rights Reserved.                          @@
6@@      Distributed under the BSD Software License (see license.txt)      @@
7@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
8
9        .text
10        .align
11        .global         unpack_decorr_stereo_pass_cont_armv7
12        .global         unpack_decorr_mono_pass_cont_armv7
13
14/* This is an assembly optimized version of the following WavPack function:
15 *
16 * void decorr_stereo_pass_cont (struct decorr_pass *dpp,
17 *                               int32_t *buffer,
 *                               int32_t sample_count,
19 *                               int32_t long_math);
20 *
21 * It performs a single pass of stereo decorrelation on the provided buffer.
22 * Note that this version of the function requires that up to 8 previous stereo
23 * samples are visible and correct. In other words, it ignores the "samples_*"
24 * fields in the decorr_pass structure and gets the history data directly
25 * from the buffer. It does, however, return the appropriate history samples
26 * to the decorr_pass structure before returning.
27 *
28 * This should work on all ARM architectures. This version of the code
29 * checks the magnitude of the decorrelation sample with a pair of shifts
30 * to avoid possible overflow (and therefore ignores the "long_math" arg).
31 * Previously I used the SSAT instruction for this, but then discovered that
32 * SSAT is not universally available (although on the armv7 I'm testing on
33 * it is slightly faster than the shifts).
34 *
35 * A mono version follows below.
36 */
37
38/*
39 * on entry:
40 *
41 * r0 = struct decorr_pass *dpp
42 * r1 = int32_t *buffer
43 * r2 = int32_t sample_count
44 * r3 = int32_t long_math
45 */
46
47        .arm
48        .type           unpack_decorr_stereo_pass_cont_armv7, STT_FUNC
49
50unpack_decorr_stereo_pass_cont_armv7:
51
52        stmfd   sp!, {r4 - r8, r10, r11, lr}
53
54        mov     r5, r0                  @ r5 = dpp
55        mov     r11, #512               @ r11 = 512 for rounding
56        ldr     r6, [r0, #4]            @ r6 = dpp->delta
57        ldr     r4, [r0, #8]            @ r4 = dpp->weight_A
58        ldr     r0, [r0, #12]           @ r0 = dpp->weight_B
59        cmp     r2, #0                  @ exit if no samples to process
60        beq     common_exit
61
62        add     r7, r1, r2, asl #3      @ r7 = buffer ending position
63        ldr     r2, [r5, #0]            @ r2 = dpp->term
64        cmp     r2, #0
65        bmi     minus_term
66
67        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
68        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
69        ldr     r8, [r1, #-8]
70        ldr     r3, [r1, #-4]
71        cmp     r2, #17
72        beq     term_17_loop
73        cmp     r2, #18
74        beq     term_18_loop
75        cmp     r2, #2
76        beq     term_2_loop
77        b       term_default_loop       @ else handle default (1-8, except 2)
78
79minus_term:
80        mov     r10, #1024              @ r10 = -1024 for weight clipping
81        rsb     r10, r10, #0            @  (only used for negative terms)
82        cmn     r2, #1
83        beq     term_minus_1
84        cmn     r2, #2
85        beq     term_minus_2
86        cmn     r2, #3
87        beq     term_minus_3
88        b       common_exit
89
90/*
91 ******************************************************************************
92 * Loop to handle term = 17 condition
93 *
94 * r0 = dpp->weight_B           r8 = previous left sample
95 * r1 = bptr                    r9 =
96 * r2 = current sample          r10 = second previous right sample
97 * r3 = previous right sample   r11 = 512 (for rounding)
98 * r4 = dpp->weight_A           ip = current decorrelation value
99 * r5 = dpp                     sp =
100 * r6 = dpp->delta              lr = second previous left sample
101 * r7 = eptr                    pc =
102 *******************************************************************************
103 */
104
term_17_loop:
        @ ---- left channel: predictor = (2 * previous) - 2nd previous ----
        rsb     ip, lr, r8, lsl #1      @ ip = (r8 << 1) - lr
        ldr     r2, [r1], #4            @ r2 = *bptr++ (left sample as read)
        mov     lr, r8                  @ previous left becomes 2nd previous
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lt17_left_wide         @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lt17_left_store

.Lt17_left_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10),
        add     r8, r8, r11, lsr #10    @  assembled from both halves
        mov     r11, #512               @ put the rounding constant back

.Lt17_left_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lt17_right             @ either was zero: leave weight_A alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lt17_right:
        @ ---- right channel: same steps with r3/r10 history and weight_B ----
        rsb     ip, r10, r3, lsl #1     @ ip = (r3 << 1) - r10
        ldr     r2, [r1], #4            @ r2 = *bptr++ (right sample as read)
        mov     r10, r3                 @ previous right becomes 2nd previous
        mov     r3, ip, lsl #11         @ same magnitude test as above
        cmp     ip, r3, asr #11
        bne     .Lt17_right_wide
        cmp     ip, #0
        mla     r3, ip, r0, r11         @ r3 = ip * weight_B + 512
        add     r3, r2, r3, asr #10
        b       .Lt17_right_store

.Lt17_right_wide:
        mov     r3, #0
        smlal   r11, r3, r0, ip         @ r3:r11 += weight_B * ip
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lt17_right_store:
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt17_next
        teq     ip, r2
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta

.Lt17_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_17_loop
        b       store_1718              @ terms 17 and 18 share their exit
157
158/*
159 ******************************************************************************
160 * Loop to handle term = 18 condition
161 *
162 * r0 = dpp->weight_B           r8 = previous left sample
163 * r1 = bptr                    r9 =
164 * r2 = current sample          r10 = second previous right sample
165 * r3 = previous right sample   r11 = 512 (for rounding)
166 * r4 = dpp->weight_A           ip = decorrelation value
167 * r5 = dpp                     sp =
168 * r6 = dpp->delta              lr = second previous left sample
169 * r7 = eptr                    pc =
170 *******************************************************************************
171 */
172
term_18_loop:
        @ ---- left: predictor = ((3 * previous) - 2nd previous) >> 1 ----
        sub     ip, r8, lr              @ ip = previous - 2nd previous
        add     ip, r8, ip, asr #1      @ ip = previous + (ip >> 1)
        mov     lr, r8                  @ previous left becomes 2nd previous
        ldr     r2, [r1], #4            @ r2 = *bptr++ (left sample as read)
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lt18_left_wide         @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lt18_left_store

.Lt18_left_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lt18_left_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lt18_right             @ either was zero: leave weight_A alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lt18_right:
        @ ---- right channel: same steps with r3/r10 history and weight_B ----
        sub     ip, r3, r10
        add     ip, r3, ip, asr #1
        mov     r10, r3                 @ previous right becomes 2nd previous
        ldr     r2, [r1], #4            @ r2 = *bptr++ (right sample as read)
        mov     r3, ip, lsl #11
        cmp     ip, r3, asr #11
        bne     .Lt18_right_wide
        cmp     ip, #0
        mla     r3, ip, r0, r11         @ r3 = ip * weight_B + 512
        add     r3, r2, r3, asr #10
        b       .Lt18_right_store

.Lt18_right_wide:
        mov     r3, #0
        smlal   r11, r3, r0, ip         @ r3:r11 += weight_B * ip
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lt18_right_store:
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt18_next
        teq     ip, r2
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta

.Lt18_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_18_loop

/* shared exit for terms 17 and 18: save the two newest samples per channel */

store_1718:
        str     r8, [r5, #16]           @ dpp + 16 <- previous left
        str     lr, [r5, #20]           @ dpp + 20 <- 2nd previous left
        str     r3, [r5, #48]           @ dpp + 48 <- previous right
        str     r10, [r5, #52]          @ dpp + 52 <- 2nd previous right
        b       common_exit             @ weights are written back there
235
236/*
237 ******************************************************************************
238 * Loop to handle term = 2 condition
239 * (note that this case can be handled by the default term handler (1-8), but
240 * this special case is faster because it doesn't have to read memory twice)
241 *
242 * r0 = dpp->weight_B           r8 = previous left sample
243 * r1 = bptr                    r9 =
244 * r2 = current sample          r10 = second previous right sample
245 * r3 = previous right sample   r11 = 512 (for rounding)
246 * r4 = dpp->weight_A           ip = decorrelation value
247 * r5 = dpp                     sp =
248 * r6 = dpp->delta              lr = second previous left sample
249 * r7 = eptr                    pc =
250 *******************************************************************************
251 */
252
term_2_loop:
        @ ---- left channel: predictor is the 2nd previous left sample ----
        mov     ip, lr                  @ ip = 2nd previous left
        ldr     r2, [r1], #4            @ r2 = *bptr++ (left sample as read)
        mov     lr, r8                  @ previous left becomes 2nd previous
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lt2_left_wide          @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lt2_left_store

.Lt2_left_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lt2_left_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lt2_right              @ either was zero: leave weight_A alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lt2_right:
        @ ---- right channel: same steps with r3/r10 history and weight_B ----
        mov     ip, r10                 @ ip = 2nd previous right
        ldr     r2, [r1], #4            @ r2 = *bptr++ (right sample as read)
        mov     r10, r3                 @ previous right becomes 2nd previous
        mov     r3, ip, lsl #11
        cmp     ip, r3, asr #11
        bne     .Lt2_right_wide
        cmp     ip, #0
        mla     r3, ip, r0, r11         @ r3 = ip * weight_B + 512
        add     r3, r2, r3, asr #10
        b       .Lt2_right_store

.Lt2_right_wide:
        mov     r3, #0
        smlal   r11, r3, r0, ip         @ r3:r11 += weight_B * ip
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lt2_right_store:
        strne   r3, [r1, #-4]
        cmpne   r2, #0
        beq     .Lt2_next
        teq     ip, r2
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta

.Lt2_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_2_loop
        b       default_term_exit       @ history is re-read from the buffer there
305
306/*
307 ******************************************************************************
308 * Loop to handle default term condition
309 *
310 * r0 = dpp->weight_B           r8 = result accumulator
311 * r1 = bptr                    r9 =
312 * r2 = dpp->term               r10 =
313 * r3 = decorrelation value     r11 = 512 (for rounding)
314 * r4 = dpp->weight_A           ip = current sample
315 * r5 = dpp                     sp =
316 * r6 = dpp->delta              lr =
317 * r7 = eptr                    pc =
318 *******************************************************************************
319 */
320
term_default_loop:
        @ ---- left channel: predictor is the left sample "term" pairs back ----
        ldr     ip, [r1]                @ ip = left sample as read
        ldr     r3, [r1, -r2, lsl #3]   @ r3 = bptr [-2 * term]
        mov     r8, r3, lsl #11         @ does r3 survive a shift up and a
        cmp     r3, r8, asr #11         @  sign-extending shift back down?
        bne     .Ltd_left_wide          @ no: needs the 64-bit multiply
        cmp     r3, #0                  @ Z = predictor is zero (used below)
        mla     r8, r3, r4, r11         @ r8 = r3 * weight_A + 512
        add     r8, ip, r8, asr #10     @ r8 = ip + (r8 >> 10)
        b       .Ltd_left_store

.Ltd_left_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, r3         @ r8:r11 += weight_A * r3
        add     r8, ip, r8, lsl #22     @ ip + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Ltd_left_store:
        str     r8, [r1], #4            @ always stored; also advances bptr
        cmpne   ip, #0                  @ predictor nonzero: test the sample too
        beq     .Ltd_right              @ either was zero: leave weight_A alone
        teq     ip, r3                  @ N = signs of ip and r3 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Ltd_right:
        @ ---- right channel: identical steps using weight_B ----
        ldr     ip, [r1]                @ ip = right sample as read
        ldr     r3, [r1, -r2, lsl #3]
        mov     r8, r3, lsl #11
        cmp     r3, r8, asr #11
        bne     .Ltd_right_wide
        cmp     r3, #0
        mla     r8, r3, r0, r11         @ r8 = r3 * weight_B + 512
        add     r8, ip, r8, asr #10
        b       .Ltd_right_store

.Ltd_right_wide:
        mov     r8, #0
        smlal   r11, r8, r0, r3         @ r8:r11 += weight_B * r3
        add     r8, ip, r8, lsl #22
        add     r8, r8, r11, lsr #10
        mov     r11, #512

.Ltd_right_store:
        str     r8, [r1], #4
        cmpne   ip, #0
        beq     .Ltd_next
        teq     ip, r3
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta

.Ltd_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_default_loop

/*
 * Exit shared by terms 1-8 (term 2 branches here from its own loop).
 * Walks backwards from the end of the processed data and copies the newest
 * "term" sample pairs into the two history arrays at dpp + 16 and dpp + 48,
 * highest index first. r6 (delta) is reused as scratch; it is dead by now.
 */

default_term_exit:
        ldr     r2, [r5, #0]            @ r2 = dpp->term = pairs left to copy

.Ltd_hist_copy:
        sub     r2, r2, #1              @ r2 = history index being filled
        sub     r1, r1, #8              @ step back one stereo pair
        add     r6, r5, #48             @ r6 = base of the right history
        ldr     r3, [r1, #4]            @ right sample of this pair
        str     r3, [r6, r2, lsl #2]
        add     r6, r5, #16             @ r6 = base of the left history
        ldr     r3, [r1]                @ left sample of this pair
        str     r3, [r6, r2, lsl #2]
        cmp     r2, #0                  @ index 0 is the last one written
        bne     .Ltd_hist_copy
        b       common_exit
390
391/*
392 ******************************************************************************
393 * Loop to handle term = -1 condition
394 *
395 * r0 = dpp->weight_B           r8 =
396 * r1 = bptr                    r9 =
397 * r2 = intermediate result     r10 = -1024 (for clipping)
398 * r3 = previous right sample   r11 = 512 (for rounding)
399 * r4 = dpp->weight_A           ip = current sample
400 * r5 = dpp                     sp =
401 * r6 = dpp->delta              lr = updated left sample
402 * r7 = eptr                    pc =
403 *******************************************************************************
404 */
405
term_minus_1:
        ldr     r3, [r1, #-4]           @ r3 = right sample preceding the buffer

term_minus_1_loop:
        @ ---- left channel: predictor is the previous right sample (r3) ----
        ldr     ip, [r1]                @ ip = left sample as read
        mov     lr, r3, lsl #11         @ does r3 survive a shift up and a
        cmp     r3, lr, asr #11         @  sign-extending shift back down?
        bne     .Lm1_left_wide          @ no: needs the 64-bit multiply
        cmp     r3, #0                  @ Z = predictor is zero (used below)
        mla     r2, r3, r4, r11         @ r2 = r3 * weight_A + 512
        add     lr, ip, r2, asr #10     @ lr = ip + (r2 >> 10) = new left
        b       .Lm1_left_store

.Lm1_left_wide:
        mov     lr, #0                  @ lr:r11 starts as 0:512
        smlal   r11, lr, r4, r3         @ lr:r11 += weight_A * r3
        add     lr, ip, lr, lsl #22     @ ip + low word of (lr:r11 >> 10)
        add     lr, lr, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lm1_left_store:
        str     lr, [r1], #8            @ store new left, bptr skips the pair
        cmpne   ip, #0                  @ predictor nonzero: test the sample too
        beq     .Lm1_right              @ either was zero: leave weight_A alone
        teq     ip, r3                  @ N = signs of ip and r3 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta
        cmp     r4, #1024               @ clamp weight_A to at most +1024
        movgt   r4, #1024
        cmp     r4, r10                 @  and at least -1024 (held in r10)
        movlt   r4, r10

.Lm1_right:
        @ ---- right channel: predictor is the left value just stored (lr) ----
        ldr     r2, [r1, #-4]           @ r2 = right sample as read
        mov     r3, lr, lsl #11         @ same magnitude test on lr
        cmp     lr, r3, asr #11
        bne     .Lm1_right_wide
        cmp     lr, #0
        mla     r3, lr, r0, r11         @ r3 = lr * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = new right (next left predictor)
        b       .Lm1_right_store

.Lm1_right_wide:
        mov     r3, #0
        smlal   r11, r3, r0, lr         @ r3:r11 += weight_B * lr
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lm1_right_store:
        strne   r3, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0
        beq     .Lm1_next
        teq     r2, lr
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta
        cmp     r0, #1024               @ clamp weight_B to +/-1024
        movgt   r0, #1024
        cmp     r0, r10
        movlt   r0, r10

.Lm1_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_minus_1_loop

        str     r3, [r5, #16]           @ dpp + 16 <- last right sample
        b       common_exit
469
470/*
471 ******************************************************************************
472 * Loop to handle term = -2 condition
473 * (note that the channels are processed in the reverse order here)
474 *
475 * r0 = dpp->weight_B           r8 =
476 * r1 = bptr                    r9 =
477 * r2 = intermediate result     r10 = -1024 (for clipping)
478 * r3 = previous left sample    r11 = 512 (for rounding)
479 * r4 = dpp->weight_A           ip = current sample
480 * r5 = dpp                     sp =
481 * r6 = dpp->delta              lr = updated right sample
482 * r7 = eptr                    pc =
483 *******************************************************************************
484 */
485
term_minus_2:
        ldr     r3, [r1, #-8]           @ r3 = left sample preceding the buffer

term_minus_2_loop:
        @ ---- right channel first: predictor is the previous left (r3) ----
        ldr     ip, [r1, #4]            @ ip = right sample as read
        mov     lr, r3, lsl #11         @ does r3 survive a shift up and a
        cmp     r3, lr, asr #11         @  sign-extending shift back down?
        bne     .Lm2_right_wide         @ no: needs the 64-bit multiply
        cmp     r3, #0                  @ Z = predictor is zero (used below)
        mla     r2, r3, r0, r11         @ r2 = r3 * weight_B + 512
        add     lr, ip, r2, asr #10     @ lr = ip + (r2 >> 10) = new right
        b       .Lm2_right_store

.Lm2_right_wide:
        mov     lr, #0                  @ lr:r11 starts as 0:512
        smlal   r11, lr, r0, r3         @ lr:r11 += weight_B * r3
        add     lr, ip, lr, lsl #22     @ ip + low word of (lr:r11 >> 10)
        add     lr, lr, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lm2_right_store:
        strne   lr, [r1, #4]            @ write result unless predictor was 0
        cmpne   ip, #0                  @ then also test the sample as read
        beq     .Lm2_left               @ either was zero: leave weight_B alone
        teq     ip, r3                  @ N = signs of ip and r3 differ
        submi   r0, r0, r6              @ differ: weight_B -= delta
        addpl   r0, r0, r6              @ agree:  weight_B += delta
        cmp     r0, #1024               @ clamp weight_B to at most +1024
        movgt   r0, #1024
        cmp     r0, r10                 @  and at least -1024 (held in r10)
        movlt   r0, r10

.Lm2_left:
        @ ---- left channel: predictor is the right value just computed (lr) ----
        ldr     r2, [r1]                @ r2 = left sample as read
        mov     r3, lr, lsl #11         @ same magnitude test on lr
        cmp     lr, r3, asr #11
        bne     .Lm2_left_wide
        cmp     lr, #0
        mla     r3, lr, r4, r11         @ r3 = lr * weight_A + 512
        add     r3, r2, r3, asr #10     @ r3 = new left (next right predictor)
        b       .Lm2_left_store

.Lm2_left_wide:
        mov     r3, #0
        smlal   r11, r3, r4, lr         @ r3:r11 += weight_A * lr
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lm2_left_store:
        str     r3, [r1], #8            @ store new left, bptr skips the pair
        cmpne   r2, #0                  @ predictor nonzero: test the sample too
        beq     .Lm2_next
        teq     r2, lr
        submi   r4, r4, r6              @ weight_A -= delta
        addpl   r4, r4, r6              @ weight_A += delta
        cmp     r4, #1024               @ clamp weight_A to +/-1024
        movgt   r4, #1024
        cmp     r4, r10
        movlt   r4, r10

.Lm2_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_minus_2_loop

        str     r3, [r5, #48]           @ dpp + 48 <- last left sample
        b       common_exit
549
550/*
551 ******************************************************************************
552 * Loop to handle term = -3 condition
553 *
554 * r0 = dpp->weight_B           r8 = previous left sample
555 * r1 = bptr                    r9 =
556 * r2 = current left sample     r10 = -1024 (for clipping)
557 * r3 = previous right sample   r11 = 512 (for rounding)
558 * r4 = dpp->weight_A           ip = intermediate result
559 * r5 = dpp                     sp =
560 * r6 = dpp->delta              lr =
561 * r7 = eptr                    pc =
562 *******************************************************************************
563 */
564
term_minus_3:
        ldr     r8, [r1, #-8]           @ r8 = left sample preceding the buffer
        ldr     r3, [r1, #-4]           @ r3 = right sample preceding the buffer

term_minus_3_loop:
        @ ---- left channel: predictor is the previous right sample (r3) ----
        ldr     ip, [r1]                @ ip = left sample as read
        mov     r2, r3, lsl #11         @ does r3 survive a shift up and a
        cmp     r3, r2, asr #11         @  sign-extending shift back down?
        bne     .Lm3_left_wide          @ no: needs the 64-bit multiply
        cmp     r3, #0                  @ Z = predictor is zero (used below)
        mla     r2, r3, r4, r11         @ r2 = r3 * weight_A + 512
        add     r2, ip, r2, asr #10     @ r2 = ip + (r2 >> 10) = new left
        b       .Lm3_left_store

.Lm3_left_wide:
        mov     r2, #0                  @ r2:r11 starts as 0:512
        smlal   r11, r2, r4, r3         @ r2:r11 += weight_A * r3
        add     r2, ip, r2, lsl #22     @ ip + low word of (r2:r11 >> 10)
        add     r2, r2, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lm3_left_store:
        str     r2, [r1], #4            @ always stored; also advances bptr
        cmpne   ip, #0                  @ predictor nonzero: test the sample too
        beq     .Lm3_right              @ either was zero: leave weight_A alone
        teq     ip, r3                  @ N = signs of ip and r3 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta
        cmp     r4, #1024               @ clamp weight_A to at most +1024
        movgt   r4, #1024
        cmp     r4, r10                 @  and at least -1024 (held in r10)
        movlt   r4, r10

.Lm3_right:
        @ ---- right channel: predictor is the previous left sample (r8) ----
        mov     ip, r8                  @ ip = left value from one pair back
        mov     r8, r2                  @ keep the new left for the next pair
        ldr     r2, [r1], #4            @ r2 = *bptr++ (right sample as read)
        mov     r3, ip, lsl #11         @ same magnitude test on ip
        cmp     ip, r3, asr #11
        bne     .Lm3_right_wide
        cmp     ip, #0
        mla     r3, ip, r0, r11         @ r3 = ip * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = new right (next left predictor)
        b       .Lm3_right_store

.Lm3_right_wide:
        mov     r3, #0
        smlal   r11, r3, r0, ip         @ r3:r11 += weight_B * ip
        add     r3, r2, r3, lsl #22
        add     r3, r3, r11, lsr #10
        mov     r11, #512

.Lm3_right_store:
        strne   r3, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0
        beq     .Lm3_next
        teq     ip, r2
        submi   r0, r0, r6              @ weight_B -= delta
        addpl   r0, r0, r6              @ weight_B += delta
        cmp     r0, #1024               @ clamp weight_B to +/-1024
        movgt   r0, #1024
        cmp     r0, r10
        movlt   r0, r10

.Lm3_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     term_minus_3_loop

        str     r3, [r5, #16]           @ dpp + 16 <- last right sample
        str     r8, [r5, #48]           @ dpp + 48 <- last left sample
                                        @ execution continues into common_exit
629
630/*
631 * Before finally exiting we must store weights back for next time
632 */
633
common_exit:
        str     r4, [r5, #8]            @ dpp->weight_A = r4
        str     r0, [r5, #12]           @ dpp->weight_B = r0
        ldmfd   sp!, {r4 - r8, r10, r11, pc}    @ restore saved regs and return

#ifdef __ELF__
        /* record the function extent so ELF tools can attribute addresses */
        .size           unpack_decorr_stereo_pass_cont_armv7, . - unpack_decorr_stereo_pass_cont_armv7
#endif
638
639
640
641/* This is a mono version of the function above. It does not handle negative terms.
642 *
643 * void decorr_mono_pass_cont (struct decorr_pass *dpp,
644 *                             int32_t *buffer,
 *                             int32_t sample_count,
646 *                             int32_t long_math);
647 * on entry:
648 *
649 * r0 = struct decorr_pass *dpp
650 * r1 = int32_t *buffer
651 * r2 = int32_t sample_count
652 * r3 = int32_t long_math
653 */
654
655        .arm
656        .type           unpack_decorr_mono_pass_cont_armv7, STT_FUNC
657
658unpack_decorr_mono_pass_cont_armv7:
659
660        stmfd   sp!, {r4 - r8, r11, lr}
661
662        mov     r5, r0                  @ r5 = dpp
663        mov     r11, #512               @ r11 = 512 for rounding
664        ldr     r6, [r0, #4]            @ r6 = dpp->delta
665        ldr     r4, [r0, #8]            @ r4 = dpp->weight_A
666        cmp     r2, #0                  @ exit if no samples to process
667        beq     mono_common_exit
668
669        add     r7, r1, r2, asl #2      @ r7 = buffer ending position
670        ldr     r2, [r5, #0]            @ r2 = dpp->term
671
672        ldr     lr, [r1, #-8]           @ load 2 sample history from buffer
673        ldr     r8, [r1, #-4]
674        cmp     r2, #17
675        beq     mono_term_17_loop
676        cmp     r2, #18
677        beq     mono_term_18_loop
678        cmp     r2, #2
679        beq     mono_term_2_loop
680        b       mono_term_default_loop  @ else handle default (1-8, except 2)
681
682/*
683 ******************************************************************************
684 * Loop to handle term = 17 condition
685 *
686 * r0 =                         r8 = previous sample
687 * r1 = bptr                    r9 =
688 * r2 = current sample          r10 =
689 * r3 =                         r11 = 512 (for rounding)
690 * r4 = dpp->weight_A           ip = current decorrelation value
691 * r5 = dpp                     sp =
692 * r6 = dpp->delta              lr = second previous sample
693 * r7 = eptr                    pc =
694 *******************************************************************************
695 */
696
mono_term_17_loop:
        @ predictor = (2 * previous) - 2nd previous
        rsb     ip, lr, r8, lsl #1      @ ip = (r8 << 1) - lr
        ldr     r2, [r1], #4            @ r2 = *bptr++ (sample as read)
        mov     lr, r8                  @ previous becomes 2nd previous
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lmono17_wide           @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lmono17_store

.Lmono17_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lmono17_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lmono17_next           @ either was zero: leave weight alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lmono17_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     mono_term_17_loop
        b       mono_store_1718         @ terms 17 and 18 share their exit
725
726/*
727 ******************************************************************************
728 * Loop to handle term = 18 condition
729 *
730 * r0 =                         r8 = previous sample
731 * r1 = bptr                    r9 =
732 * r2 = current sample          r10 =
733 * r3 =                         r11 = 512 (for rounding)
734 * r4 = dpp->weight_A           ip = decorrelation value
735 * r5 = dpp                     sp =
736 * r6 = dpp->delta              lr = second previous sample
737 * r7 = eptr                    pc =
738 *******************************************************************************
739 */
740
mono_term_18_loop:
        @ predictor = ((3 * previous) - 2nd previous) >> 1
        sub     ip, r8, lr              @ ip = previous - 2nd previous
        add     ip, r8, ip, asr #1      @ ip = previous + (ip >> 1)
        mov     lr, r8                  @ previous becomes 2nd previous
        ldr     r2, [r1], #4            @ r2 = *bptr++ (sample as read)
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lmono18_wide           @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lmono18_store

.Lmono18_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lmono18_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lmono18_next           @ either was zero: leave weight alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lmono18_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     mono_term_18_loop

/* shared exit for terms 17 and 18: save the two newest samples */

mono_store_1718:
        str     r8, [r5, #16]           @ dpp + 16 <- previous sample
        str     lr, [r5, #20]           @ dpp + 20 <- 2nd previous sample
        b       mono_common_exit        @ the weight is written back there
776
777/*
778 ******************************************************************************
779 * Loop to handle term = 2 condition
780 * (note that this case can be handled by the default term handler (1-8), but
781 * this special case is faster because it doesn't have to read memory twice)
782 *
783 * r0 =                         r8 = previous sample
784 * r1 = bptr                    r9 =
785 * r2 = current sample          r10 =
786 * r3 =                         r11 = 512 (for rounding)
787 * r4 = dpp->weight_A           ip = decorrelation value
788 * r5 = dpp                     sp =
789 * r6 = dpp->delta              lr = second previous sample
790 * r7 = eptr                    pc =
791 *******************************************************************************
792 */
793
mono_term_2_loop:
        @ predictor is the 2nd previous sample
        mov     ip, lr                  @ ip = 2nd previous
        ldr     r2, [r1], #4            @ r2 = *bptr++ (sample as read)
        mov     lr, r8                  @ previous becomes 2nd previous
        mov     r8, ip, lsl #11         @ does ip survive a shift up and a
        cmp     ip, r8, asr #11         @  sign-extending shift back down?
        bne     .Lmono2_wide            @ no: needs the 64-bit multiply
        cmp     ip, #0                  @ Z = predictor is zero (used below)
        mla     r8, ip, r4, r11         @ r8 = ip * weight_A + 512
        add     r8, r2, r8, asr #10     @ r8 = r2 + (r8 >> 10)
        b       .Lmono2_store

.Lmono2_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, ip         @ r8:r11 += weight_A * ip
        add     r8, r2, r8, lsl #22     @ r2 + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lmono2_store:
        strne   r8, [r1, #-4]           @ write result unless predictor was 0
        cmpne   r2, #0                  @ then also test the sample as read
        beq     .Lmono2_next            @ either was zero: leave weight alone
        teq     ip, r2                  @ N = signs of ip and r2 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lmono2_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     mono_term_2_loop
        b       mono_default_term_exit  @ history is re-read from the buffer there
822
823/*
824 ******************************************************************************
825 * Loop to handle default term condition
826 *
827 * r0 =                         r8 = result accumulator
828 * r1 = bptr                    r9 =
829 * r2 = dpp->term               r10 =
830 * r3 = decorrelation value     r11 = 512 (for rounding)
831 * r4 = dpp->weight_A           ip = current sample
832 * r5 = dpp                     sp =
833 * r6 = dpp->delta              lr =
834 * r7 = eptr                    pc =
835 *******************************************************************************
836 */
837
mono_term_default_loop:
        @ predictor is the sample "term" positions back in the buffer
        ldr     ip, [r1]                @ ip = sample as read
        ldr     r3, [r1, -r2, lsl #2]   @ r3 = bptr [-term]
        mov     r8, r3, lsl #11         @ does r3 survive a shift up and a
        cmp     r3, r8, asr #11         @  sign-extending shift back down?
        bne     .Lmonod_wide            @ no: needs the 64-bit multiply
        mla     r8, r3, r4, r11         @ r8 = r3 * weight_A + 512
        add     r8, ip, r8, asr #10     @ r8 = ip + (r8 >> 10)
        b       .Lmonod_store

.Lmonod_wide:
        mov     r8, #0                  @ r8:r11 starts as 0:512
        smlal   r11, r8, r4, r3         @ r8:r11 += weight_A * r3
        add     r8, ip, r8, lsl #22     @ ip + low word of (r8:r11 >> 10)
        add     r8, r8, r11, lsr #10
        mov     r11, #512               @ put the rounding constant back

.Lmonod_store:
        str     r8, [r1], #4            @ always stored; also advances bptr
        cmp     r3, #0                  @ skip the weight update when either
        cmpne   ip, #0                  @  the predictor or the sample is zero
        beq     .Lmonod_next
        teq     ip, r3                  @ N = signs of ip and r3 differ
        submi   r4, r4, r6              @ differ: weight_A -= delta
        addpl   r4, r4, r6              @ agree:  weight_A += delta

.Lmonod_next:
        cmp     r7, r1                  @ continue while bptr is below eptr
        bhi     mono_term_default_loop

/*
 * Exit shared by terms 1-8 (term 2 branches here from its own loop).
 * Walks backwards from the end of the processed data and copies the newest
 * "term" samples into the history array at dpp + 16, highest index first.
 * r6 (delta) is reused as scratch; it is dead by now.
 */

mono_default_term_exit:
        ldr     r2, [r5, #0]            @ r2 = dpp->term = samples left to copy

.Lmonod_hist_copy:
        sub     r2, r2, #1              @ r2 = history index being filled
        sub     r1, r1, #4              @ step back one sample
        add     r6, r5, #16             @ r6 = base of the history array
        ldr     r3, [r1]
        str     r3, [r6, r2, lsl #2]
        cmp     r2, #0                  @ index 0 is the last one written
        bne     .Lmonod_hist_copy
        b       mono_common_exit
881
882/*
883 * Before finally exiting we must store weight back for next time
884 */
885
mono_common_exit:
        str     r4, [r5, #8]            @ dpp->weight_A = r4
        ldmfd   sp!, {r4 - r8, r11, pc} @ restore saved regs and return

#ifdef __ELF__
        /* record the function extent so ELF tools can attribute addresses */
        .size           unpack_decorr_mono_pass_cont_armv7, . - unpack_decorr_mono_pass_cont_armv7
#endif
889
/* On ELF targets emit an empty .note.GNU-stack section with no flags set. */
#if defined(__ELF__)
        .section        .note.GNU-stack, "", %progbits
#endif
893
894