1    .syntax unified
2@ Copyright (c) 2007-2008 CSIRO
3@ Copyright (c) 2007-2009 Xiph.Org Foundation
4@ Copyright (c) 2013      Parrot
5@ Written by Aurélien Zanelli
6@
7@ Redistribution and use in source and binary forms, with or without
8@ modification, are permitted provided that the following conditions
9@ are met:
10@
11@ - Redistributions of source code must retain the above copyright
12@ notice, this list of conditions and the following disclaimer.
13@
14@ - Redistributions in binary form must reproduce the above copyright
15@ notice, this list of conditions and the following disclaimer in the
16@ documentation and/or other materials provided with the distribution.
17@
18@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
@ ---------------------------------------------------------------------------
@ Assembler configuration and exported symbols.
@ ---------------------------------------------------------------------------
        .text
        .p2align        2               @ 2^2 = 4-byte alignment
        .arch           armv7-a         @ accept ARMv7-A instructions ...
        .fpu            neon            @ ... and Advanced SIMD (NEON)
        @ Override the architecture recorded in the object's EABI attributes,
        @ so the object is tagged ARMv4T even though it contains newer code.
        .object_arch    armv4t

        @ The OPUS_ARM_MAY_HAVE_* symbols tested below are presumably defined
        @ by this include (it is not visible from this file).
        .include        "celt/arm/armopts-gnu.S"

        .if OPUS_ARM_MAY_HAVE_EDSP
        .global         celt_pitch_xcorr_edsp
        .endif

        .if OPUS_ARM_MAY_HAVE_NEON
        .global         celt_pitch_xcorr_neon
        .endif
43
44 .if OPUS_ARM_MAY_HAVE_NEON
45
@ ---------------------------------------------------------------------------
@ xcorr_kernel_neon: NEON inner kernel producing four correlation lags at once.
@
@   sum[k] += x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[len-1+k],   k = 0..3
@
@ File-local routine with a private register interface (not AAPCS); callers
@ branch-and-link to xcorr_kernel_neon_start.
@
@ In:
@   r3 = int         len     must be > 0: the first y load is unconditional
@   r4 = opus_val16 *x
@   r5 = opus_val16 *y
@   q0 = opus_val32  sum[4]  running totals, one 32-bit lane per lag
@   lr = return address
@ Out:
@   q0 = opus_val32  sum[4]
@ Written:    r4, r5 (post-incremented), r12, flags, d3, q2, q3, q8
@ Left alone: r0-r3, r6-r11, d2, q4-q7, q9-q15
@
@ Working registers:
@   r12 = j, the remaining-sample counter (biased by the current step size)
@   d3  = y_3|y_2|y_1|y_0
@   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4   (d4 = low half, d5 = high half)
@   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
@   q8  = scratch for the shifted y windows built with VEXT
@
@ Between stages d5 always carries the four y values that line up with the
@ next unprocessed x sample.
@ ---------------------------------------------------------------------------
        .type   xcorr_kernel_neon, %function
xcorr_kernel_neon:
xcorr_kernel_neon_start:
        @ Prime d5 with y[0..3].
        vld1.16         {d5}, [r5]!
        subs            r12, r3, #8             @ j = len - 8
        ble             .Lxkn_tail4             @ 8 or fewer samples: no 8-wide pass

@ ---- 8 samples per pass -----------------------------------------------------
@ Each pass fetches one y value beyond what it consumes, so the loop must stop
@ while 8 or fewer samples remain (rather than 7) to stay inside the array.
@
@ Scheduling notes from the original author: 19 instructions per pass, of
@ which 2 cycles are ARM instructions, 10 cycles are load/store/permute and
@ 9 cycles are NEON data processing.  On a Cortex-A8 the last two groups
@ dual-issue for 9 cycles, giving an expected 10 cycles per pass barring
@ cache misses.
.Lxkn_loop8:
        vld1.16         {d6, d7}, [r4]!         @ q3 = x[0..7]
        @ The register copy is done with VAND because, unlike VMOV, it is a
        @ data-processing instruction (and is not re-encoded as VMOV the way
        @ VORR would be), so it pairs with the load above.
        vand            d3, d5, d5              @ d3 = y[0..3]
        subs            r12, r12, #8            @ j -= 8 (flags consumed by BGT below)
        vld1.16         {d4, d5}, [r5]!         @ q2 = y[4..11]
        vmlal.s16       q0, d3, d6[0]           @ sum[k] += x_0 * y[k]
        vext.16         d16, d3, d4, #1         @ d16 = y[1..4]
        vmlal.s16       q0, d4, d7[0]           @ sum[k] += x_4 * y[k+4]
        vext.16         d17, d4, d5, #1         @ d17 = y[5..8]
        vmlal.s16       q0, d16, d6[1]          @ sum[k] += x_1 * y[k+1]
        vext.16         d16, d3, d4, #2         @ d16 = y[2..5]
        vmlal.s16       q0, d17, d7[1]          @ sum[k] += x_5 * y[k+5]
        vext.16         d17, d4, d5, #2         @ d17 = y[6..9]
        vmlal.s16       q0, d16, d6[2]          @ sum[k] += x_2 * y[k+2]
        vext.16         d16, d3, d4, #3         @ d16 = y[3..6]
        vmlal.s16       q0, d17, d7[2]          @ sum[k] += x_6 * y[k+6]
        vext.16         d17, d4, d5, #3         @ d17 = y[7..10]
        vmlal.s16       q0, d16, d6[3]          @ sum[k] += x_3 * y[k+3]
        vmlal.s16       q0, d17, d7[3]          @ sum[k] += x_7 * y[k+7]
        bgt             .Lxkn_loop8

@ ---- 4 samples, taken only when more than 4 remain --------------------------
@ (this stage also fetches one y value more than it consumes)
.Lxkn_tail4:
        adds            r12, r12, #4
        ble             .Lxkn_tail2
        vld1.16         d6, [r4]!               @ d6 = x[0..3]
        vand            d4, d5, d5              @ d4 = y[0..3] (VAND for the same pairing reason)
        sub             r12, r12, #4
        vld1.16         d5, [r5]!               @ d5 = y[4..7]
        vmlal.s16       q0, d4, d6[0]           @ sum[k] += x_0 * y[k]
        vext.16         d16, d4, d5, #1         @ d16 = y[1..4]
        vmlal.s16       q0, d16, d6[1]          @ sum[k] += x_1 * y[k+1]
        vext.16         d16, d4, d5, #2         @ d16 = y[2..5]
        vmlal.s16       q0, d16, d6[2]          @ sum[k] += x_2 * y[k+2]
        vext.16         d16, d4, d5, #3         @ d16 = y[3..6]
        vmlal.s16       q0, d16, d6[3]          @ sum[k] += x_3 * y[k+3]

@ ---- 2 samples, taken only when more than 2 remain --------------------------
@ (again fetching one y value more than it consumes)
.Lxkn_tail2:
        adds            r12, r12, #2
        ble             .Lxkn_tail1
        vld2.16         {d6[],d7[]}, [r4]!      @ d6 = x_0 in every lane, d7 = x_1 in every lane
        vand            d4, d5, d5              @ d4 = y[0..3]
        sub             r12, r12, #2
        vld1.32         {d5[]}, [r5]!           @ d5 = y_5|y_4|y_5|y_4
        vmlal.s16       q0, d4, d6              @ sum[k] += x_0 * y[k]
        vext.16         d16, d4, d5, #1         @ d16 = y[1..4]
        @ Overwrite the low {y_5,y_4} copy in d5 with {y_3,y_2} from d4 so that
        @ d5 = y[2..5].  VSRI is used instead of VEXT because it is a
        @ data-processing instruction.
        vsri.64         d5, d4, #32
        vmlal.s16       q0, d16, d7             @ sum[k] += x_1 * y[k+1]

@ ---- 1 sample, using the y window already sitting in d5 ---------------------
.Lxkn_tail1:
        vld1.16         {d6[]}, [r4]!           @ d6 = next x in every lane
        adds            r12, r12, #1            @ flags: is there one more sample after this?
        vmlal.s16       q0, d5, d6              @ sum[k] += x * y[k]
        movle           pc, lr                  @ that was the last sample: return

        @ Exactly one sample is left.  Fetch a single new y (no read-ahead)
        @ and slide the window along by one element.
        vld1.16         {d4[]}, [r5]!           @ d4 = new y in every lane
        vsri.64         d4, d5, #16             @ d4 = y_new | previous window shifted down
        vld1.16         {d6[]}, [r4]!           @ d6 = last x in every lane
        vmlal.s16       q0, d4, d6              @ sum[k] += x * y[k]
        mov             pc, lr
        .size   xcorr_kernel_neon, .-xcorr_kernel_neon
157
@ ---------------------------------------------------------------------------
@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
@                                  opus_val32 *xcorr, int len, int max_pitch,
@                                  int arch)
@
@ Writes xcorr[i] = x[0]*y[i] + ... + x[len-1]*y[i+len-1] for i in
@ [0, max_pitch) and returns the largest value written, or 1 if none of
@ them exceeds 1.
@
@ In:
@   r0     = opus_val16 *_x
@   r1     = opus_val16 *_y
@   r2     = opus_val32 *xcorr
@   r3     = int         len
@   [sp]   = int         max_pitch   (first stack argument)
@   [sp+4] = int         arch        (never read)
@ Out:
@   r0     = maxcorr
@
@ Register roles:
@   r4  = x cursor handed to the inner loops
@   r5  = y cursor handed to the inner loops
@   r6  = lags still to compute (biased by the current step size)
@   r12 = j, sample counter of the one-lag-at-a-time path
@   q15 = maxcorr[4]; the kernel never writes q15
@ ---------------------------------------------------------------------------
        .type   celt_pitch_xcorr_neon, %function
celt_pitch_xcorr_neon:
        stmfd           sp!, {r4-r6, lr}
        ldr             r6, [sp, #16]           @ max_pitch sits above the 16 bytes just pushed
        vmov.s32        q15, #1                 @ maxcorr[0..3] = 1
        subs            r6, r6, #4
        blt             .Lcpxn_quad_done        @ max_pitch < 4: skip the 4-lag path

@ ---- four lags per call to the kernel ---------------------------------------
.Lcpxn_quad:
        @ Kernel inputs: r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
        mov             r4, r0
        mov             r5, r1
        veor            q0, q0, q0
        @ The kernel writes only r4, r5, r12, flags, d3, q2, q3 and q8; nothing
        @ that is live here (r0-r3, r6, q15) needs saving around the call.
        bl              xcorr_kernel_neon_start
        subs            r6, r6, #4              @ flags consumed by BGE below
        vst1.32         {q0}, [r2]!             @ xcorr[i..i+3] = sum[0..3]
        add             r1, r1, #8              @ _y += 4
        vmax.s32        q15, q15, q0            @ lane-wise running maximum
        bge             .Lcpxn_quad             @ at least 4 more lags: go again

@ ---- fewer than four lags left ----------------------------------------------
.Lcpxn_quad_done:
        adds            r6, r6, #4              @ r6 = lags left; flags consumed by BLE below
        @ Fold the four running maxima; afterwards both lanes of d30 hold the
        @ overall maximum.  These NEON operations do not change the flags.
        vmax.s32        d30, d30, d31
        vpmax.s32       d30, d30, d30
        ble             .Lcpxn_done             @ nothing left to compute

@ ---- remaining lags, one at a time ------------------------------------------
.Lcpxn_single:
        mov             r4, r0
        mov             r5, r1
        vmov.i32        q0, #0
        subs            r12, r3, #8             @ j = len - 8
        blt             .Lcpxn_single_tail4

        @ Eight products per pass.
.Lcpxn_single_loop8:
        vld1.16         {q1}, [r4]!             @ q1 = x[0..7]
        vld1.16         {q2}, [r5]!             @ q2 = y[0..7]
        subs            r12, r12, #8
        vmlal.s16       q0, d4, d2
        vmlal.s16       q0, d5, d3
        bge             .Lcpxn_single_loop8

        @ Four products, if at least four samples remain.
.Lcpxn_single_tail4:
        adds            r12, r12, #4
        blt             .Lcpxn_single_reduce
        vld1.16         {d2}, [r4]!             @ d2 = x[0..3]
        vld1.16         {d3}, [r5]!             @ d3 = y[0..3]
        sub             r12, r12, #4
        vmlal.s16       q0, d3, d2

        @ Collapse the four partial sums; the total ends up in lane 0 of d0.
.Lcpxn_single_reduce:
        vadd.s32        d0, d0, d1
        vpaddl.s32      d0, d0
        adds            r12, r12, #4            @ r12 = samples still to add (0..3)
        ble             .Lcpxn_single_store

        @ Leftover samples one by one.  Each value is broadcast to every lane,
        @ but only lane 0 of q0 is used afterwards.
.Lcpxn_single_loop1:
        vld1.16         {d2[]}, [r4]!
        vld1.16         {d3[]}, [r5]!
        subs            r12, r12, #1
        vmlal.s16       q0, d2, d3
        bgt             .Lcpxn_single_loop1

.Lcpxn_single_store:
        vst1.32         {d0[0]}, [r2]!          @ xcorr[i] = sum
        vmax.s32        d30, d30, d0            @ only lane 0 is read back at the end
        subs            r6, r6, #1              @ --max_pitch; flags consumed by BGT below
        add             r1, r1, #2              @ _y++
        bgt             .Lcpxn_single

.Lcpxn_done:
        vmov.32         r0, d30[0]              @ return maxcorr
        ldmfd           sp!, {r4-r6, pc}
        .size   celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon
258
259 .endif
260
261 .if OPUS_ARM_MAY_HAVE_EDSP
262
@ ---------------------------------------------------------------------------
@ xcorr_kernel_edsp: four-lag correlation kernel built on the 16x16+32
@ multiply-accumulate instructions (SMLAxy).
@
@   sum[k] += x[0]*y[k] + ... + x[len-1]*y[len-1+k],   k = 0..3
@
@ The original author notes that this runs on ARMv7 parts without NEON and is
@ scheduled for dual issue: multiplies must issue from pipeline 0 and cannot
@ pair with one another, so a load or ALU operation is slotted next to a
@ multiply wherever possible.
@
@ File-local routine with a private register interface (not AAPCS); callers
@ branch-and-link to xcorr_kernel_edsp_start.
@
@ In:
@   r3      = int         len
@   r4      = opus_val16 *_x   (must be 32-bit aligned)
@   r5      = opus_val16 *_y   (must be 32-bit aligned)
@   r6..r9  = opus_val32  sum[0..3]
@   lr      = return address
@ Out:
@   r6..r9  = opus_val32  sum[0..3]
@ Unchanged on return: r0-r5 (r2, r4 and r5 are saved and restored here)
@ Written:             r10, r11, r12, r14, flags
@
@ Working registers:
@   r2       = j, samples remaining (reused for y_5 near the very end)
@   r12, r14 = packed pairs of x samples
@   r10, r11 = packed pairs of y samples forming the current 4-wide window
@
@ SMLA<a><b> Rd, Rn, Rm, Ra adds (half <a> of Rn) * (half <b> of Rm) to Ra,
@ with B = bottom and T = top halfword.
@ ---------------------------------------------------------------------------
        .type   xcorr_kernel_edsp, %function
xcorr_kernel_edsp:
xcorr_kernel_edsp_start:
        stmfd           sp!, {r2,r4,r5,lr}
        ldr             r10, [r5], #4           @ r10 = y_1|y_0
        subs            r2, r3, #4              @ j = len - 4
        ldr             r11, [r5], #4           @ r11 = y_3|y_2
        ble             .Lxke_tail              @ 4 or fewer samples: no full pass
        ldr             r12, [r4], #4           @ r12 = x_1|x_0
        @ (load-use stall here)

@ ---- 4 samples per pass -----------------------------------------------------
@ Every non-multiply below is meant to pair with a multiply, so the body
@ should run without stalls.
.Lxke_loop4:
        smlabb          r6, r12, r10, r6        @ sum[0] += x_0 * y_0
        ldr             r14, [r4], #4           @ r14 = x_3|x_2
        smlabt          r7, r12, r10, r7        @ sum[1] += x_0 * y_1
        subs            r2, r2, #4              @ j -= 4 (flags consumed by LDRGT/BGT below)
        smlabb          r8, r12, r11, r8        @ sum[2] += x_0 * y_2
        smlabt          r9, r12, r11, r9        @ sum[3] += x_0 * y_3
        smlatt          r6, r12, r10, r6        @ sum[0] += x_1 * y_1
        ldr             r10, [r5], #4           @ r10 = y_5|y_4
        smlatb          r7, r12, r11, r7        @ sum[1] += x_1 * y_2
        smlatt          r8, r12, r11, r8        @ sum[2] += x_1 * y_3
        smlatb          r9, r12, r10, r9        @ sum[3] += x_1 * y_4
        ldrgt           r12, [r4], #4           @ next pass only: r12 = next x pair
        smlabb          r6, r14, r11, r6        @ sum[0] += x_2 * y_2
        smlabt          r7, r14, r11, r7        @ sum[1] += x_2 * y_3
        smlabb          r8, r14, r10, r8        @ sum[2] += x_2 * y_4
        smlabt          r9, r14, r10, r9        @ sum[3] += x_2 * y_5
        smlatt          r6, r14, r11, r6        @ sum[0] += x_3 * y_3
        ldr             r11, [r5], #4           @ r11 = y_7|y_6
        smlatb          r7, r14, r10, r7        @ sum[1] += x_3 * y_4
        smlatt          r8, r14, r10, r8        @ sum[2] += x_3 * y_5
        smlatb          r9, r14, r11, r9        @ sum[3] += x_3 * y_6
        bgt             .Lxke_loop4

@ ---- up to 4 leftover samples, one halfword of x at a time ------------------
@ r10/r11 hold the four y values aligned with the next x sample.
.Lxke_tail:
        adds            r2, r2, #4              @ r2 = samples left
        ble             .Lxke_done

        @ 1st leftover sample
        ldrh            r12, [r4], #2           @ r12 = *x++
        subs            r2, r2, #1              @ j--
        @ (load-use stall here)
        smlabb          r6, r12, r10, r6        @ sum[0] += x * y_0
        ldrhgt          r14, [r4], #2           @ more to come: r14 = *x++
        smlabt          r7, r12, r10, r7        @ sum[1] += x * y_1
        smlabb          r8, r12, r11, r8        @ sum[2] += x * y_2
        smlabt          r9, r12, r11, r9        @ sum[3] += x * y_3
        ble             .Lxke_done

        @ 2nd leftover sample
        smlabt          r6, r14, r10, r6        @ sum[0] += x * y_1
        subs            r2, r2, #1              @ j--
        smlabb          r7, r14, r11, r7        @ sum[1] += x * y_2
        ldrh            r10, [r5], #2           @ r10 = y_4 = *y++
        smlabt          r8, r14, r11, r8        @ sum[2] += x * y_3
        ldrhgt          r12, [r4], #2           @ more to come: r12 = *x++
        smlabb          r9, r14, r10, r9        @ sum[3] += x * y_4
        ble             .Lxke_done

        @ 3rd leftover sample
        smlabb          r6, r12, r11, r6        @ sum[0] += x * y_2
        cmp             r2, #1                  @ is there a 4th? (r2 is about to be reused)
        smlabt          r7, r12, r11, r7        @ sum[1] += x * y_3
        ldrh            r2, [r5], #2            @ r2 = y_5 = *y++
        smlabb          r8, r12, r10, r8        @ sum[2] += x * y_4
        ldrhgt          r14, [r4]               @ 4th exists: r14 = *x
        smlabb          r9, r12, r2, r9         @ sum[3] += x * y_5
        ble             .Lxke_done

        @ 4th leftover sample
        smlabt          r6, r14, r11, r6        @ sum[0] += x * y_3 (r11 still holds y_3|y_2)
        ldrh            r11, [r5]               @ r11 = y_6 = *y
        smlabb          r7, r14, r10, r7        @ sum[1] += x * y_4
        smlabb          r8, r14, r2, r8         @ sum[2] += x * y_5
        smlabb          r9, r14, r11, r9        @ sum[3] += x * y_6
.Lxke_done:
        ldmfd           sp!, {r2,r4,r5,pc}
        .size   xcorr_kernel_edsp, .-xcorr_kernel_edsp
348
@ ---------------------------------------------------------------------------
@ celt_pitch_xcorr_edsp: EDSP (SMLAxy) pitch cross-correlation driver.
@
@ Writes xcorr[i] = x[0]*y[i] + ... + x[len-1]*y[i+len-1] for each lag i and
@ returns the largest value written, or 1 if none of them exceeds 1.
@
@ In:
@   r0      = opus_val16 *_x     (must be 32-bit aligned)
@   r1      = opus_val16 *_y     (only needs to be 16-bit aligned)
@   r2      = opus_val32 *xcorr
@   r3      = int         len
@   [sp]    = int         max_pitch   (first stack argument)
@   [sp+4]  = int         arch        (never read)
@ Out:
@   r0      = maxcorr
@
@ Register roles after the prologue:
@   r0       = maxcorr
@   r1       = lags still to compute (biased by the current step size)
@   r2       = xcorr write cursor
@   r4       = x cursor
@   r5       = y cursor
@   r6..r9   = sum0..sum3 around the 4-lag kernel; x/y scratch elsewhere
@   r10, r11 = sum0, sum1 of the 2-lag path
@   r12      = j, sample counter
@   r14      = sum of the 1-lag paths
@
@ Phases:
@   1. if y is not 32-bit aligned, compute one lag to bring it into alignment
@   2. four lags at a time through xcorr_kernel_edsp
@   3. two lags at once, if at least two remain
@   4. one final lag, if one remains
@
@ SMLA<a><b> Rd, Rn, Rm, Ra adds (half <a> of Rn) * (half <b> of Rm) to Ra,
@ with B = bottom and T = top halfword.
@ ---------------------------------------------------------------------------
        .type   celt_pitch_xcorr_edsp, %function
celt_pitch_xcorr_edsp:
        stmfd           sp!, {r4-r11, lr}
        mov             r5, r1                  @ r5 = _y
        ldr             r1, [sp, #36]           @ r1 = max_pitch, above the 36 bytes just pushed
        mov             r4, r0                  @ r4 = _x
        tst             r5, #3                  @ is y 32-bit aligned?
        mov             r0, #1                  @ maxcorr = 1 (flags untouched)
        beq             .Lcpxe_aligned

@ ---- phase 1: one lag from a halfword-aligned y -----------------------------
        subs            r12, r3, #4             @ j = len - 4
        mov             r14, #0                 @ sum = 0
        ldrh            r8, [r5], #2            @ r8 = y_0; r5 is word aligned from here on
        ble             .Lcpxe_u1_tail
        ldr             r6, [r4], #4            @ r6 = x_1|x_0
        mov             r8, r8, lsl #16         @ park y_0 in the top half, as the loop expects
.Lcpxe_u1_loop4:
        ldr             r9, [r5], #4            @ r9 = y_2|y_1
        smlabt          r14, r6, r8, r14        @ sum += x_0 * y_0
        ldr             r7, [r4], #4            @ r7 = x_3|x_2
        smlatb          r14, r6, r9, r14        @ sum += x_1 * y_1
        ldr             r8, [r5], #4            @ r8 = y_4|y_3
        smlabt          r14, r7, r9, r14        @ sum += x_2 * y_2
        subs            r12, r12, #4            @ j -= 4
        smlatb          r14, r7, r8, r14        @ sum += x_3 * y_3
        ldrgt           r6, [r4], #4            @ next pass only: next x pair
        bgt             .Lcpxe_u1_loop4
        mov             r8, r8, lsr #16         @ bring the pending y down to the bottom half
.Lcpxe_u1_tail:
        adds            r12, r12, #4            @ r12 = samples left
.Lcpxe_u1_loop1:
        ldrhge          r6, [r4], #2            @ r6 = *x++
        @ (load-use stall here)
        smlabbge        r14, r6, r8, r14        @ sum += *x * *y
        subsge          r12, r12, #1
        ldrhgt          r8, [r5], #2            @ more to come: r8 = *y++
        bgt             .Lcpxe_u1_loop1
        sub             r4, r4, r3, lsl #1      @ rewind x by len elements
        sub             r5, r5, r3, lsl #1      @ rewind y by len elements ...
        cmp             r0, r14                 @ maxcorr = max(maxcorr, sum)
        add             r5, r5, #2              @ ... then step on (r5 = starting y + 1 element)
        movlt           r0, r14
        subs            r1, r1, #1              @ one lag done
        str             r14, [r2], #4           @ xcorr[i] = sum
        ble             .Lcpxe_done

@ ---- phase 2: four lags per kernel call -------------------------------------
.Lcpxe_aligned:
        subs            r1, r1, #4
        blt             .Lcpxe_pair             @ fewer than 4 lags left
.Lcpxe_quad:
        @ Kernel inputs: r3 = len, r4 = _x, r5 = _y, r6..r9 = sum[4] = {0, 0, 0, 0}
        mov             r6, #0
        mov             r7, #0
        mov             r8, #0
        mov             r9, #0
        bl              xcorr_kernel_edsp_start @ four sums for lags i..i+3
        @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
        cmp             r0, r6
        add             r5, r5, #8              @ _y += 4
        movlt           r0, r6
        cmp             r0, r7
        movlt           r0, r7
        cmp             r0, r8
        movlt           r0, r8
        cmp             r0, r9
        movlt           r0, r9
        stmia           r2!, {r6-r9}            @ xcorr[i..i+3] = sum0..sum3
        subs            r1, r1, #4
        bge             .Lcpxe_quad

@ ---- phase 3: two lags at once ----------------------------------------------
.Lcpxe_pair:
        adds            r1, r1, #2
        blt             .Lcpxe_last             @ fewer than 2 lags left
        subs            r12, r3, #4             @ j = len - 4 (flags consumed by BLE below)
        mov             r10, #0                 @ sum0 = 0
        mov             r11, #0                 @ sum1 = 0
        ldr             r8, [r5], #4            @ r8 = y_1|y_0
        ble             .Lcpxe_pair_tail2
        ldr             r6, [r4], #4            @ r6 = x_1|x_0
        ldr             r9, [r5], #4            @ r9 = y_3|y_2
.Lcpxe_pair_loop4:
        smlabb          r10, r6, r8, r10        @ sum0 += x_0 * y_0
        ldr             r7, [r4], #4            @ r7 = x_3|x_2
        smlabt          r11, r6, r8, r11        @ sum1 += x_0 * y_1
        subs            r12, r12, #4            @ j -= 4
        smlatt          r10, r6, r8, r10        @ sum0 += x_1 * y_1
        ldr             r8, [r5], #4            @ r8 = y_5|y_4
        smlatb          r11, r6, r9, r11        @ sum1 += x_1 * y_2
        ldrgt           r6, [r4], #4            @ next pass only: next x pair
        smlabb          r10, r7, r9, r10        @ sum0 += x_2 * y_2
        smlabt          r11, r7, r9, r11        @ sum1 += x_2 * y_3
        smlatt          r10, r7, r9, r10        @ sum0 += x_3 * y_3
        ldrgt           r9, [r5], #4            @ next pass only: next y pair
        smlatb          r11, r7, r8, r11        @ sum1 += x_3 * y_4
        bgt             .Lcpxe_pair_loop4
.Lcpxe_pair_tail2:
        adds            r12, r12, #2
        ble             .Lcpxe_pair_tail1       @ 2 or fewer samples left
        ldr             r6, [r4], #4            @ r6 = x_1|x_0
        @ (load-use stall here)
        smlabb          r10, r6, r8, r10        @ sum0 += x_0 * y_0
        ldr             r9, [r5], #4            @ r9 = y_3|y_2
        smlabt          r11, r6, r8, r11        @ sum1 += x_0 * y_1
        sub             r12, r12, #2
        smlatt          r10, r6, r8, r10        @ sum0 += x_1 * y_1
        mov             r8, r9                  @ slide the y window on by two
        smlatb          r11, r6, r9, r11        @ sum1 += x_1 * y_2
.Lcpxe_pair_tail1:
        ldrh            r6, [r4], #2            @ r6 = *x++
        adds            r12, r12, #1            @ flags: one more sample after this one?
        @ (load-use stall here)
        smlabb          r10, r6, r8, r10        @ sum0 += x_0 * y_0
        ldrhgt          r7, [r4], #2            @ yes: r7 = *x++
        smlabt          r11, r6, r8, r11        @ sum1 += x_0 * y_1
        ble             .Lcpxe_pair_done
        ldrh            r9, [r5], #2            @ r9 = y_2 = *y++
        smlabt          r10, r7, r8, r10        @ sum0 += x_1 * y_1
        smlabb          r11, r7, r9, r11        @ sum1 += x_1 * y_2
.Lcpxe_pair_done:
        sub             r4, r4, r3, lsl #1      @ rewind x by len elements
        sub             r5, r5, r3, lsl #1      @ rewind y by len elements ...
        cmp             r0, r10                 @ maxcorr = max(maxcorr, sum0)
        add             r5, r5, #2              @ ... then step on (r5 = starting y + 2 elements)
        movlt           r0, r10
        sub             r1, r1, #2              @ two lags done
        cmp             r0, r11                 @ maxcorr = max(maxcorr, sum1)
        str             r10, [r2], #4           @ xcorr[i]   = sum0
        movlt           r0, r11
        str             r11, [r2], #4           @ xcorr[i+1] = sum1

@ ---- phase 4: one final lag (y is word aligned here) ------------------------
.Lcpxe_last:
        adds            r1, r1, #1
        blt             .Lcpxe_done             @ nothing left
        subs            r12, r3, #4             @ j = len - 4 (flags consumed by BLT below)
        mov             r14, #0                 @ sum = 0
        blt             .Lcpxe_last_tail        @ fewer than 4 samples
        ldr             r6, [r4], #4            @ r6 = x_1|x_0
        ldr             r8, [r5], #4            @ r8 = y_1|y_0
        ldr             r7, [r4], #4            @ r7 = x_3|x_2
        ldr             r9, [r5], #4            @ r9 = y_3|y_2
.Lcpxe_last_loop4:
        smlabb          r14, r6, r8, r14        @ sum += x_0 * y_0
        subs            r12, r12, #4            @ j -= 4
        smlatt          r14, r6, r8, r14        @ sum += x_1 * y_1
        ldrge           r6, [r4], #4            @ the loads below happen only if another pass follows
        smlabb          r14, r7, r9, r14        @ sum += x_2 * y_2
        ldrge           r8, [r5], #4
        smlatt          r14, r7, r9, r14        @ sum += x_3 * y_3
        ldrge           r7, [r4], #4
        ldrge           r9, [r5], #4
        bge             .Lcpxe_last_loop4
.Lcpxe_last_tail:
        adds            r12, r12, #2            @ at least 2 samples left?
        ldrge           r6, [r4], #4
        ldrge           r8, [r5], #4
        @ (load-use stall here)
        smlabbge        r14, r6, r8, r14        @ sum += x_0 * y_0
        subge           r12, r12, #2
        smlattge        r14, r6, r8, r14        @ sum += x_1 * y_1
        adds            r12, r12, #1            @ one odd sample left?
        ldrhge          r6, [r4], #2
        ldrhge          r8, [r5], #2
        @ (load-use stall here)
        smlabbge        r14, r6, r8, r14        @ sum += *x * *y
        cmp             r0, r14                 @ maxcorr = max(maxcorr, sum)
        str             r14, [r2], #4           @ xcorr[i] = sum
        movlt           r0, r14
.Lcpxe_done:
        ldmfd           sp!, {r4-r11, pc}
        .size   celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp
551
552 .endif
553
554@ END:
555    .section	.note.GNU-stack,"",%progbits
556