1    .syntax unified
2@ Copyright (c) 2007-2008 CSIRO
3@ Copyright (c) 2007-2009 Xiph.Org Foundation
4@ Copyright (c) 2013      Parrot
5@ Written by Aurélien Zanelli
6@
7@ Redistribution and use in source and binary forms, with or without
8@ modification, are permitted provided that the following conditions
9@ are met:
10@
11@ - Redistributions of source code must retain the above copyright
12@ notice, this list of conditions and the following disclaimer.
13@
14@ - Redistributions in binary form must reproduce the above copyright
15@ notice, this list of conditions and the following disclaimer in the
16@ documentation and/or other materials provided with the distribution.
17@
18@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
@ Assemble for ARMv7-A with NEON so the encodings below are accepted;
@ .object_arch overrides the architecture recorded in the object file
@ attributes (armv4t) instead of the one implied by .arch.
30    .text;   .p2align 2;   .arch armv7-a
31   .fpu neon
32   .object_arch armv4t
33
@ armopts-gnu.S provides the OPUS_ARM_MAY_HAVE_* configuration symbols
@ tested by the .if blocks below.
34  .include "celt/arm/armopts-gnu.S"
35
@ Export each pitch-correlation entry point only when the corresponding
@ instruction-set variant is enabled by the build configuration.
36 .if OPUS_ARM_MAY_HAVE_EDSP
37  .global celt_pitch_xcorr_edsp
38 .endif
39
40 .if OPUS_ARM_MAY_HAVE_NEON
41  .global celt_pitch_xcorr_neon
42 .endif
43
44 .if OPUS_ARM_MAY_HAVE_NEON
45
46@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
47; xcorr_kernel_neon: @ PROC
48xcorr_kernel_neon_start:
49  @ input:
50  @   r3     = int         len
51  @   r4     = opus_val16 *x
52  @   r5     = opus_val16 *y
53  @   q0     = opus_val32  sum[4]
54  @ output:
55  @   q0     = opus_val32  sum[4]
56  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
57  @ internal usage:
58  @   r12 = int j
59  @   d3  = y_3|y_2|y_1|y_0
60  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
61  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
62  @   q8  = scratch
63  @
  @ NOTE(review): this kernel builds no stack frame and returns with
  @ MOV pc, lr, so it must only be reached via BL (as the NEON driver does).
64  @ Load y[0...3]
65  @ This requires len>0 to always be valid (which we assert in the C code).
66  VLD1.16      {d5}, [r5]!
  @ j = len - 8; skip the 8-at-a-time loop when 8 or fewer samples remain.
67  SUBS         r12, r3, #8
68  BLE xcorr_kernel_neon_process4
69@ Process 8 samples at a time.
70@ This loop loads one y value more than we actually need. Therefore we have to
71@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
72@ reading past the end of the array.
73xcorr_kernel_neon_process8:
74  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
75  @ - 2 cycles of ARM instructions,
76  @ - 10 cycles of load/store/byte permute instructions, and
77  @ - 9 cycles of data processing instructions.
78  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
79  @ latter two categories, meaning the whole loop should run in 10 cycles per
80  @ iteration, barring cache misses.
81  @
82  @ Load x[0...7]
83  VLD1.16      {d6, d7}, [r4]!
84  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
85  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
86  VAND         d3, d5, d5
87  SUBS         r12, r12, #8
88  @ Load y[4...11]
89  VLD1.16      {d4, d5}, [r5]!
  @ sum[k] += x[j]*y[j+k] for k=0...3: each VEXT forms the y vector shifted by
  @ one more position, and each VMLAL.S16 accumulates four 16x16->32 products.
90  VMLAL.S16    q0, d3, d6[0]
91  VEXT.16      d16, d3, d4, #1
92  VMLAL.S16    q0, d4, d7[0]
93  VEXT.16      d17, d4, d5, #1
94  VMLAL.S16    q0, d16, d6[1]
95  VEXT.16      d16, d3, d4, #2
96  VMLAL.S16    q0, d17, d7[1]
97  VEXT.16      d17, d4, d5, #2
98  VMLAL.S16    q0, d16, d6[2]
99  VEXT.16      d16, d3, d4, #3
100  VMLAL.S16    q0, d17, d7[2]
101  VEXT.16      d17, d4, d5, #3
102  VMLAL.S16    q0, d16, d6[3]
103  VMLAL.S16    q0, d17, d7[3]
104  BGT xcorr_kernel_neon_process8
105@ Process 4 samples here if we have > 4 left (still reading one extra y value).
106xcorr_kernel_neon_process4:
107  ADDS         r12, r12, #4
108  BLE xcorr_kernel_neon_process2
109  @ Load x[0...3]
110  VLD1.16      d6, [r4]!
111  @ Use VAND since it's a data processing instruction again.
112  VAND         d4, d5, d5
113  SUB          r12, r12, #4
114  @ Load y[4...7]
115  VLD1.16      d5, [r5]!
116  VMLAL.S16    q0, d4, d6[0]
117  VEXT.16      d16, d4, d5, #1
118  VMLAL.S16    q0, d16, d6[1]
119  VEXT.16      d16, d4, d5, #2
120  VMLAL.S16    q0, d16, d6[2]
121  VEXT.16      d16, d4, d5, #3
122  VMLAL.S16    q0, d16, d6[3]
123@ Process 2 samples here if we have > 2 left (still reading one extra y value).
124xcorr_kernel_neon_process2:
125  ADDS         r12, r12, #2
126  BLE xcorr_kernel_neon_process1
127  @ Load x[0...1]
128  VLD2.16      {d6[],d7[]}, [r4]!
129  @ Use VAND since it's a data processing instruction again.
130  VAND         d4, d5, d5
131  SUB          r12, r12, #2
132  @ Load y[4...5]
133  VLD1.32      {d5[]}, [r5]!
134  VMLAL.S16    q0, d4, d6
135  VEXT.16      d16, d4, d5, #1
136  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
137  @ instead of VEXT, since it's a data-processing instruction.
138  VSRI.64      d5, d4, #32
139  VMLAL.S16    q0, d16, d7
140@ Process 1 sample using the extra y value we loaded above.
141xcorr_kernel_neon_process1:
142  @ Load next *x
143  VLD1.16      {d6[]}, [r4]!
144  ADDS         r12, r12, #1
145  @ y[0...3] are left in d5 from prior iteration(s) (if any)
146  VMLAL.S16    q0, d5, d6
  @ Return now unless one final sample remains (the ADDS result was > 0).
147  MOVLE        pc, lr
148@ Now process 1 last sample, not reading ahead.
149  @ Load last *y
150  VLD1.16      {d4[]}, [r5]!
  @ Combine the final y value with the earlier y values still held in d5.
151  VSRI.64      d4, d5, #16
152  @ Load last *x
153  VLD1.16      {d6[]}, [r4]!
154  VMLAL.S16    q0, d4, d6
155  MOV          pc, lr
156	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
157
158@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
159@  opus_val32 *xcorr, int len, int max_pitch)
160; celt_pitch_xcorr_neon: @ PROC
161  @ input:
162  @   r0  = opus_val16 *_x
163  @   r1  = opus_val16 *_y
164  @   r2  = opus_val32 *xcorr
165  @   r3  = int         len
166  @ output:
167  @   r0  = int         maxcorr
168  @ internal usage:
169  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
170  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
171  @   r6  = int         max_pitch
172  @   r12 = int         j
173  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
174  STMFD        sp!, {r4-r6, lr}
  @ Load the fifth argument, max_pitch: four registers (16 bytes) were just
  @ pushed, so the first stack argument now sits at [sp, #16].
175  LDR          r6, [sp, #16]
  @ maxcorr[0...3] = 1
176  VMOV.S32     q15, #1
177  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
178  SUBS         r6, r6, #4
179  BLT celt_pitch_xcorr_neon_process4_done
180celt_pitch_xcorr_neon_process4:
181  @ xcorr_kernel_neon parameters:
182  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
183  MOV          r4, r0
184  MOV          r5, r1
185  VEOR         q0, q0, q0
186  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
187  @ So we don't save/restore any other registers.
188  BL xcorr_kernel_neon_start
189  SUBS         r6, r6, #4
  @ xcorr[i...i+3] = sum[0...3]
190  VST1.32      {q0}, [r2]!
191  @ _y += 4 (samples; 8 bytes of int16)
192  ADD          r1, r1, #8
  @ maxcorr[k] = max(maxcorr[k], sum[k])
193  VMAX.S32     q15, q15, q0
194  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
195  BGE celt_pitch_xcorr_neon_process4
196@ We have less than 4 sums left to compute.
197celt_pitch_xcorr_neon_process4_done:
198  ADDS         r6, r6, #4
199  @ Reduce maxcorr to a single value
200  VMAX.S32     d30, d30, d31
201  VPMAX.S32    d30, d30, d30
202  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
203  BLE celt_pitch_xcorr_neon_done
204@ Now compute each remaining sum one at a time.
205celt_pitch_xcorr_neon_process_remaining:
206  MOV          r4, r0
207  MOV          r5, r1
208  VMOV.I32     q0, #0
209  SUBS         r12, r3, #8
210  BLT celt_pitch_xcorr_neon_process_remaining4
211@ Sum terms 8 at a time.
212celt_pitch_xcorr_neon_process_remaining_loop8:
213  @ Load x[0...7]
214  VLD1.16      {q1}, [r4]!
215  @ Load y[0...7]
216  VLD1.16      {q2}, [r5]!
217  SUBS         r12, r12, #8
218  VMLAL.S16    q0, d4, d2
219  VMLAL.S16    q0, d5, d3
220  BGE celt_pitch_xcorr_neon_process_remaining_loop8
221@ Sum terms 4 at a time.
222celt_pitch_xcorr_neon_process_remaining4:
223  ADDS         r12, r12, #4
224  BLT celt_pitch_xcorr_neon_process_remaining4_done
225  @ Load x[0...3]
226  VLD1.16      {d2}, [r4]!
227  @ Load y[0...3]
228  VLD1.16      {d3}, [r5]!
229  SUB          r12, r12, #4
230  VMLAL.S16    q0, d3, d2
231celt_pitch_xcorr_neon_process_remaining4_done:
232  @ Reduce the sum to a single value.
233  VADD.S32     d0, d0, d1
234  VPADDL.S32   d0, d0
235  ADDS         r12, r12, #4
236  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
237@ Sum terms 1 at a time.
238celt_pitch_xcorr_neon_process_remaining_loop1:
239  VLD1.16      {d2[]}, [r4]!
240  VLD1.16      {d3[]}, [r5]!
241  SUBS         r12, r12, #1
242  VMLAL.S16    q0, d2, d3
243  BGT celt_pitch_xcorr_neon_process_remaining_loop1
244celt_pitch_xcorr_neon_process_remaining_loop_done:
  @ xcorr[i] = sum; maxcorr = max(maxcorr, sum)
245  VST1.32      {d0[0]}, [r2]!
246  VMAX.S32     d30, d30, d0
247  SUBS         r6, r6, #1
248  @ _y++
249  ADD          r1, r1, #2
250  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
251  BGT celt_pitch_xcorr_neon_process_remaining
252celt_pitch_xcorr_neon_done:
  @ Return the scalar maxcorr.
253  VMOV.32      r0, d30[0]
254  LDMFD        sp!, {r4-r6, pc}
255	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
256
257 .endif
258
259 .if OPUS_ARM_MAY_HAVE_EDSP
260
261@ This will get used on ARMv7 devices without NEON, so it has been optimized
262@ to take advantage of dual-issuing where possible.
263; xcorr_kernel_edsp: @ PROC
264xcorr_kernel_edsp_start:
  @ Compute sum[k] += sum(x[j]*y[j+k], j=0...len-1) for k=0...3 using the
  @ EDSP 16x16 multiply-accumulate instructions (SMLABB/SMLABT/SMLATB/SMLATT).
265  @ input:
266  @   r3      = int         len
267  @   r4      = opus_val16 *_x (must be 32-bit aligned)
268  @   r5      = opus_val16 *_y (must be 32-bit aligned)
269  @   r6...r9 = opus_val32  sum[4]
270  @ output:
271  @   r6...r9 = opus_val32  sum[4]
272  @ preserved: r0-r5
273  @ internal usage
274  @   r2      = int         j
275  @   r12,r14 = opus_val16  x[4]
276  @   r10,r11 = opus_val16  y[4]
  @ r2 is saved because the caller keeps its xcorr pointer there while this
  @ kernel reuses r2 as the loop counter j.
277  STMFD        sp!, {r2,r4,r5,lr}
278  LDR          r10, [r5], #4      @ Load y[0...1]
279  SUBS         r2, r3, #4         @ j = len-4
280  LDR          r11, [r5], #4      @ Load y[2...3]
281  BLE xcorr_kernel_edsp_process4_done
282  LDR          r12, [r4], #4      @ Load x[0...1]
283  @ Stall
284xcorr_kernel_edsp_process4:
285  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
286  @ other. Every other instruction here dual-issues with a multiply, and is
287  @ thus "free". There should be no stalls in the body of the loop.
288  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
289  LDR          r14, [r4], #4      @ Load x[2...3]
290  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
291  SUBS         r2, r2, #4         @ j-=4
292  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
293  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
294  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
295  LDR          r10, [r5], #4      @ Load y[4...5]
296  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
297  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
298  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
299  LDRGT        r12, [r4], #4      @ Load x[0...1]
300  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
301  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
302  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
303  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
304  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
305  LDR          r11, [r5], #4      @ Load y[6...7]
306  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
307  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
308  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
309  BGT xcorr_kernel_edsp_process4
310xcorr_kernel_edsp_process4_done:
  @ Handle the 0 to 3 remaining samples one at a time, reusing the y values
  @ still cached in r10/r11 and loading only what each extra step needs.
311  ADDS         r2, r2, #4
312  BLE xcorr_kernel_edsp_done
313  LDRH         r12, [r4], #2      @ r12 = *x++
314  SUBS         r2, r2, #1         @ j--
315  @ Stall
316  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
317  LDRHGT       r14, [r4], #2      @ r14 = *x++
318  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
319  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
320  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
321  BLE xcorr_kernel_edsp_done
322  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
323  SUBS         r2, r2, #1         @ j--
324  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
325  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
326  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
327  LDRHGT       r12, [r4], #2      @ r12 = *x++
328  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
329  BLE xcorr_kernel_edsp_done
330  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
331  CMP          r2, #1             @ j--
332  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
333  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
334  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
335  LDRHGT       r14, [r4]          @ r14 = *x
336  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
337  BLE xcorr_kernel_edsp_done
338  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
339  LDRH         r11, [r5]          @ r11 = y_6 = *y
340  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
341  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
342  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
343xcorr_kernel_edsp_done:
344  LDMFD        sp!, {r2,r4,r5,pc}
345	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
346
347; celt_pitch_xcorr_edsp: @ PROC
348  @ input:
349  @   r0  = opus_val16 *_x (must be 32-bit aligned)
350  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
351  @   r2  = opus_val32 *xcorr
352  @   r3  = int         len
353  @ output:
354  @   r0  = maxcorr
355  @ internal usage
356  @   r4  = opus_val16 *x
357  @   r5  = opus_val16 *y
358  @   r6  = opus_val32  sum0
359  @   r7  = opus_val32  sum1
360  @   r8  = opus_val32  sum2
361  @   r9  = opus_val32  sum3
362  @   r1  = int         max_pitch
363  @   r12 = int         j
364  STMFD        sp!, {r4-r11, lr}
365  MOV          r5, r1
  @ Load the fifth argument, max_pitch: nine registers (36 bytes) were just
  @ pushed, so the first stack argument now sits at [sp, #36].
366  LDR          r1, [sp, #36]
367  MOV          r4, r0
  @ Test whether _y is 32-bit aligned.
368  TST          r5, #3
369  @ maxcorr = 1
370  MOV          r0, #1
371  BEQ          celt_pitch_xcorr_edsp_process1u_done
372@ Compute one sum at the start to make y 32-bit aligned.
373  SUBS         r12, r3, #4
374  @ r14 = sum = 0
375  MOV          r14, #0
376  LDRH         r8, [r5], #2
377  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
378  LDR          r6, [r4], #4
  @ Keep the odd-alignment y value in the top half of r8 so SMLABT can use it.
379  MOV          r8, r8, LSL #16
380celt_pitch_xcorr_edsp_process1u_loop4:
381  LDR          r9, [r5], #4
382  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
383  LDR          r7, [r4], #4
384  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
385  LDR          r8, [r5], #4
386  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
387  SUBS         r12, r12, #4         @ j-=4
388  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
389  LDRGT        r6, [r4], #4
390  BGT celt_pitch_xcorr_edsp_process1u_loop4
  @ Move the pending y value back to the bottom half for the 1-at-a-time loop.
391  MOV          r8, r8, LSR #16
392celt_pitch_xcorr_edsp_process1u_loop4_done:
393  ADDS         r12, r12, #4
394celt_pitch_xcorr_edsp_process1u_loop1:
395  LDRHGE       r6, [r4], #2
396  @ Stall
397  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
398  SUBSGE       r12, r12, #1
399  LDRHGT       r8, [r5], #2
400  BGT celt_pitch_xcorr_edsp_process1u_loop1
401  @ Restore _x
402  SUB          r4, r4, r3, LSL #1
403  @ Restore and advance _y
404  SUB          r5, r5, r3, LSL #1
405  @ maxcorr = max(maxcorr, sum)
406  CMP          r0, r14
407  ADD          r5, r5, #2
408  MOVLT        r0, r14
409  SUBS         r1, r1, #1
410  @ xcorr[i] = sum
411  STR          r14, [r2], #4
412  BLE celt_pitch_xcorr_edsp_done
413celt_pitch_xcorr_edsp_process1u_done:
414  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
415  SUBS         r1, r1, #4
416  BLT celt_pitch_xcorr_edsp_process2
417celt_pitch_xcorr_edsp_process4:
418  @ xcorr_kernel_edsp parameters:
419  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
420  MOV          r6, #0
421  MOV          r7, #0
422  MOV          r8, #0
423  MOV          r9, #0
424  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
425  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
426  CMP          r0, r6
427  @ _y+=4
428  ADD          r5, r5, #8
429  MOVLT        r0, r6
430  CMP          r0, r7
431  MOVLT        r0, r7
432  CMP          r0, r8
433  MOVLT        r0, r8
434  CMP          r0, r9
435  MOVLT        r0, r9
  @ xcorr[i...i+3] = sum[0...3]
436  STMIA        r2!, {r6-r9}
437  SUBS         r1, r1, #4
438  BGE celt_pitch_xcorr_edsp_process4
439celt_pitch_xcorr_edsp_process2:
  @ Compute the last 2 sums together (if 2 or 3 remain).
440  ADDS         r1, r1, #2
441  BLT celt_pitch_xcorr_edsp_process1a
442  SUBS         r12, r3, #4
443  @ {r10, r11} = {sum0, sum1} = {0, 0}
444  MOV          r10, #0
445  MOV          r11, #0
446  LDR          r8, [r5], #4
447  BLE celt_pitch_xcorr_edsp_process2_loop_done
448  LDR          r6, [r4], #4
449  LDR          r9, [r5], #4
450celt_pitch_xcorr_edsp_process2_loop4:
451  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
452  LDR          r7, [r4], #4
453  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
454  SUBS         r12, r12, #4         @ j-=4
455  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
456  LDR          r8, [r5], #4
457  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
458  LDRGT        r6, [r4], #4
459  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
460  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
461  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
462  LDRGT        r9, [r5], #4
463  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
464  BGT celt_pitch_xcorr_edsp_process2_loop4
465celt_pitch_xcorr_edsp_process2_loop_done:
466  ADDS         r12, r12, #2
467  BLE  celt_pitch_xcorr_edsp_process2_1
468  LDR          r6, [r4], #4
469  @ Stall
470  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
471  LDR          r9, [r5], #4
472  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
473  SUB          r12, r12, #2
474  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
475  MOV          r8, r9
476  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
477celt_pitch_xcorr_edsp_process2_1:
478  LDRH         r6, [r4], #2
479  ADDS         r12, r12, #1
480  @ Stall
481  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
482  LDRHGT       r7, [r4], #2
483  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
484  BLE celt_pitch_xcorr_edsp_process2_done
485  LDRH         r9, [r5], #2
486  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
487  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
488celt_pitch_xcorr_edsp_process2_done:
489  @ Restore _x
490  SUB          r4, r4, r3, LSL #1
491  @ Restore and advance _y
492  SUB          r5, r5, r3, LSL #1
493  @ maxcorr = max(maxcorr, sum0)
494  CMP          r0, r10
495  ADD          r5, r5, #2
496  MOVLT        r0, r10
497  SUB          r1, r1, #2
498  @ maxcorr = max(maxcorr, sum1)
499  CMP          r0, r11
500  @ xcorr[i] = sum
501  STR          r10, [r2], #4
502  MOVLT        r0, r11
503  STR          r11, [r2], #4
504celt_pitch_xcorr_edsp_process1a:
  @ Compute the final sum (if one remains).
505  ADDS         r1, r1, #1
506  BLT celt_pitch_xcorr_edsp_done
507  SUBS         r12, r3, #4
508  @ r14 = sum = 0
509  MOV          r14, #0
510  BLT celt_pitch_xcorr_edsp_process1a_loop_done
511  LDR          r6, [r4], #4
512  LDR          r8, [r5], #4
513  LDR          r7, [r4], #4
514  LDR          r9, [r5], #4
515celt_pitch_xcorr_edsp_process1a_loop4:
516  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
517  SUBS         r12, r12, #4         @ j-=4
518  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
519  LDRGE        r6, [r4], #4
520  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
521  LDRGE        r8, [r5], #4
522  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
523  LDRGE        r7, [r4], #4
524  LDRGE        r9, [r5], #4
525  BGE celt_pitch_xcorr_edsp_process1a_loop4
526celt_pitch_xcorr_edsp_process1a_loop_done:
527  ADDS         r12, r12, #2
528  LDRGE        r6, [r4], #4
529  LDRGE        r8, [r5], #4
530  @ Stall
531  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
532  SUBGE        r12, r12, #2
533  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
534  ADDS         r12, r12, #1
535  LDRHGE       r6, [r4], #2
536  LDRHGE       r8, [r5], #2
537  @ Stall
538  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
539  @ maxcorr = max(maxcorr, sum)
540  CMP          r0, r14
541  @ xcorr[i] = sum
542  STR          r14, [r2], #4
543  MOVLT        r0, r14
544celt_pitch_xcorr_edsp_done:
545  LDMFD        sp!, {r4-r11, pc}
546	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
547
548 .endif
549
550@ END:
551    .section	.note.GNU-stack,"",%progbits
552