1; Copyright (c) 2007-2008 CSIRO
2; Copyright (c) 2007-2009 Xiph.Org Foundation
3; Copyright (c) 2013      Parrot
4; Written by Aurélien Zanelli
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions
8; are met:
9;
10; - Redistributions of source code must retain the above copyright
11; notice, this list of conditions and the following disclaimer.
12;
13; - Redistributions in binary form must reproduce the above copyright
14; notice, this list of conditions and the following disclaimer in the
15; documentation and/or other materials provided with the distribution.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
21; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29  AREA  |.text|, CODE, READONLY
30
31  GET    celt/arm/armopts.s
32
33IF OPUS_ARM_MAY_HAVE_EDSP
34  EXPORT celt_pitch_xcorr_edsp
35ENDIF
36
37IF OPUS_ARM_MAY_HAVE_NEON
38  EXPORT celt_pitch_xcorr_neon
39ENDIF
40
41IF OPUS_ARM_MAY_HAVE_NEON
42
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
  ; input:
  ;   r3     = int         len
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8        ; j = len-8; flags pick the tail path
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5         ; d3 = y[0...3] (saved before d5 is reloaded)
  SUBS         r12, r12, #8       ; j -= 8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]      ; sum[k] += x_0*y_k
  VEXT.16      d16, d3, d4, #1    ; d16 = y[1...4]
  VMLAL.S16    q0, d4, d7[0]      ; sum[k] += x_4*y_{k+4}
  VEXT.16      d17, d4, d5, #1    ; d17 = y[5...8]
  VMLAL.S16    q0, d16, d6[1]     ; sum[k] += x_1*y_{k+1}
  VEXT.16      d16, d3, d4, #2    ; d16 = y[2...5]
  VMLAL.S16    q0, d17, d7[1]     ; sum[k] += x_5*y_{k+5}
  VEXT.16      d17, d4, d5, #2    ; d17 = y[6...9]
  VMLAL.S16    q0, d16, d6[2]     ; sum[k] += x_2*y_{k+2}
  VEXT.16      d16, d3, d4, #3    ; d16 = y[3...6]
  VMLAL.S16    q0, d17, d7[2]     ; sum[k] += x_6*y_{k+6}
  VEXT.16      d17, d4, d5, #3    ; d17 = y[7...10]
  VMLAL.S16    q0, d16, d6[3]     ; sum[k] += x_3*y_{k+3}
  VMLAL.S16    q0, d17, d7[3]     ; sum[k] += x_7*y_{k+7}
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4       ; j += 4; GT iff more than 4 samples remain
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5         ; d4 = y[0...3]
  SUB          r12, r12, #4       ; j -= 4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]      ; sum[k] += x_0*y_k
  VEXT.16      d16, d4, d5, #1    ; d16 = y[1...4]
  VMLAL.S16    q0, d16, d6[1]     ; sum[k] += x_1*y_{k+1}
  VEXT.16      d16, d4, d5, #2    ; d16 = y[2...5]
  VMLAL.S16    q0, d16, d6[2]     ; sum[k] += x_2*y_{k+2}
  VEXT.16      d16, d4, d5, #3    ; d16 = y[3...6]
  VMLAL.S16    q0, d16, d6[3]     ; sum[k] += x_3*y_{k+3}
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2       ; j += 2; GT iff more than 2 samples remain
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1] (x_0 duplicated across d6, x_1 duplicated across d7)
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5         ; d4 = y[0...3]
  SUB          r12, r12, #2       ; j -= 2
  ; Load y[4...5] (duplicated into both 32-bit lanes of d5)
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6         ; sum[k] += x_0*y_k
  VEXT.16      d16, d4, d5, #1    ; d16 = y[1...4]
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7        ; sum[k] += x_1*y_{k+1}
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x (duplicated into all lanes of d6)
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1       ; j += 1; GT iff one final sample remains
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6         ; sum[k] += x_0*y_k
  MOVLE        pc, lr             ; return if no samples remain
; Now process 1 last sample, not reading ahead.
  ; Load last *y (duplicated into all lanes of d4)
  VLD1.16      {d4[]}, [r5]!
  ; Shift y[1...3] down one lane and keep the new value on top: d4 = y[1...4]
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6         ; sum[k] += x_0*y_k
  MOV          pc, lr
  ENDP
154
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0  = opus_val16 *_x
  ;   r1  = opus_val16 *_y
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  ; ignored:
  ;         int         arch
  STMFD        sp!, {r4-r6, lr}
  ; Load max_pitch (5th argument): it sits just above the 16 bytes pushed.
  LDR          r6, [sp, #16]
  ; maxcorr = 1 (in every lane)
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  ; So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  SUBS         r6, r6, #4         ; max_pitch -= 4
  ; xcorr[i...i+3] = sum[0...3]
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  ; maxcorr[k] = max(maxcorr[k], sum[k])
  VMAX.S32     q15, q15, q0
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4         ; undo the bias: r6 = remaining max_pitch
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0             ; sum = 0
  SUBS         r12, r3, #8        ; j = len-8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8       ; j -= 8
  VMLAL.S16    q0, d4, d2         ; sum += x[0...3]*y[0...3]
  VMLAL.S16    q0, d5, d3         ; sum += x[4...7]*y[4...7]
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4       ; j += 4; GE iff at least 4 samples remain
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4       ; j -= 4
  VMLAL.S16    q0, d3, d2         ; sum += x[0...3]*y[0...3]
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4       ; j += 4; GT iff samples remain
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!      ; load *x++ into all lanes of d2
  VLD1.16      {d3[]}, [r5]!      ; load *y++ into all lanes of d3
  SUBS         r12, r12, #1       ; j--
  VMLAL.S16    q0, d2, d3         ; accumulate *x * *y (only lane 0 is stored)
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32      {d0[0]}, [r2]!     ; xcorr[i] = sum
  VMAX.S32     d30, d30, d0       ; maxcorr = max(maxcorr, sum)
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32      r0, d30[0]         ; return maxcorr
  LDMFD        sp!, {r4-r6, pc}
  ENDP
255
256ENDIF
257
258IF OPUS_ARM_MAY_HAVE_EDSP
259
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  ; Each 32-bit LDR below fetches two 16-bit samples at once; the SMLAxy
  ; B/T halfword selectors assume the lower-indexed sample is in the
  ; bottom halfword (little-endian).
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ADDS         r2, r2, #4         ; undo the bias: r2 = samples left (0...4)
  BLE xcorr_kernel_edsp_done
; Handle the final 1 to 4 samples one at a time, reusing the y values that
; are still live in r10/r11 and reading ahead only as the count allows.
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j-- (flags only: GT iff a 4th sample remains)
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++ (j is no longer needed)
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP
345
celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = maxcorr
  ; internal usage
  ;   r4  = opus_val16 *x
  ;   r5  = opus_val16 *y
  ;   r6  = opus_val32  sum0
  ;   r7  = opus_val32  sum1
  ;   r8  = opus_val32  sum2
  ;   r9  = opus_val32  sum3
  ;   r1  = int         max_pitch
  ;   r12 = int         j
  ; ignored:
  ;         int         arch
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; Load max_pitch (5th argument): 9 registers (36 bytes) were just pushed.
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3             ; is _y 32-bit aligned?
  ; maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4        ; j = len-4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2       ; r8 = y_0 = *y++
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4       ; Load x[0...1]
  MOV          r8, r8, LSL #16    ; move y_0 to the top half so SMLABT reads it
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  MOV          r8, r8, LSR #16    ; move the pending y value to the bottom half
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4       ; j += 4; GE iff samples remain
celt_pitch_xcorr_edsp_process1u_loop1
  LDRHGE       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1         ; max_pitch--
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp_start  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  ; xcorr[i...i+3] = sum[0...3]
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4         ; max_pitch -= 4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ADDS         r1, r1, #2         ; GE iff at least 2 sums remain
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4        ; j = len-4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4       ; Load y[0...1]
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4       ; Load x[0...1]
  LDR          r9, [r5], #4       ; Load y[2...3]
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4         ; Load x[2...3]
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4         ; Load y[4...5]
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2       ; GT iff at least 2 samples remain
  BLE  celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4       ; Load x[0...1]
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4       ; Load y[2...3]
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9             ; advance the live y pair
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1       ; GT iff one more sample remains
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2       ; r9 = y_2 = *y++
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2         ; max_pitch -= 2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #1         ; GE iff one final sum remains
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4        ; j = len-4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4       ; Load x[0...1]
  LDR          r8, [r5], #4       ; Load y[0...1]
  LDR          r7, [r4], #4       ; Load x[2...3]
  LDR          r9, [r5], #4       ; Load y[2...3]
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2       ; GE iff at least 2 samples remain
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1       ; GE iff one last sample remains
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP
548
549ENDIF
550
551END
552