1; Copyright (c) 2007-2008 CSIRO
2; Copyright (c) 2007-2009 Xiph.Org Foundation
3; Copyright (c) 2013      Parrot
4; Written by Aurélien Zanelli
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions
8; are met:
9;
10; - Redistributions of source code must retain the above copyright
11; notice, this list of conditions and the following disclaimer.
12;
13; - Redistributions in binary form must reproduce the above copyright
14; notice, this list of conditions and the following disclaimer in the
15; documentation and/or other materials provided with the distribution.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
21; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
  AREA  |.text|, CODE, READONLY

; Pull in the build-time capability flags (OPUS_ARM_MAY_HAVE_*) that gate the
; conditional blocks below.
  GET    celt/arm/armopts.s

IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
42
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
; Secondary entry point: BL'd to directly by celt_pitch_xcorr_neon using the
; custom (non-AAPCS) register convention documented below.
xcorr_kernel_neon_start
  ; input:
  ;   r3     = int         len
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  MOVLE        pc, lr             ; Return here if no final sample remains.
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr             ; Return (sum[4] is in q0).
  ENDP
154
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0  = opus_val16 *_x
  ;   r1  = opus_val16 *_y
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  STMFD        sp!, {r4-r6, lr}
  ; max_pitch is the 5th argument: it lives on the stack, above the four
  ; registers we just saved (4*4 = 16 bytes).
  LDR          r6, [sp, #16]
  ; maxcorr = 1, replicated in all four lanes of q15.
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  ; So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32      {d0[0]}, [r2]!
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  ; Return the scalar maxcorr in r0.
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP
253
ENDIF ; OPUS_ARM_MAY_HAVE_NEON

IF OPUS_ARM_MAY_HAVE_EDSP
257
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
; Secondary entry point: BL'd to directly by celt_pitch_xcorr_edsp using the
; custom (non-AAPCS) register convention documented below.
xcorr_kernel_edsp_start
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  STMFD        sp!, {r2,r4,r5,lr}  ; Save r2/r4/r5 so they can be preserved.
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ; Finish the last 0...4 samples one at a time (falls through when done).
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j--
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD        sp!, {r2,r4,r5,pc} ; Restore preserved registers and return.
  ENDP
343
celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = maxcorr
  ; internal usage
  ;   r4  = opus_val16 *x
  ;   r5  = opus_val16 *y
  ;   r6  = opus_val32  sum0
  ;   r7  = opus_val32  sum1
  ;   r8  = opus_val32  sum2
  ;   r9  = opus_val32  sum3
  ;   r1  = int         max_pitch
  ;   r12 = int         j
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; max_pitch is the 5th argument: it lives on the stack, above the nine
  ; registers we just saved (9*4 = 36 bytes).
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3             ; Is _y already 32-bit aligned?
  ; maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  ; Move the next unused y sample (top halfword of r8) down to the bottom
  ; halfword, where the 1-at-a-time loop below reads it with SMLABB.
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRHGE       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp_start  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ; Compute two more sums (if max_pitch has at least 2 left).
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2
  BLE  celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ; Compute the final sum (if max_pitch has one left).
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}   ; Restore registers and return maxcorr in r0.
  ENDP
544
ENDIF ; OPUS_ARM_MAY_HAVE_EDSP

END
548