; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 29 AREA |.text|, CODE, READONLY 30 31 GET celt/arm/armopts.s 32 33IF OPUS_ARM_MAY_HAVE_EDSP 34 EXPORT celt_pitch_xcorr_edsp 35ENDIF 36 37IF OPUS_ARM_MAY_HAVE_NEON 38 EXPORT celt_pitch_xcorr_neon 39ENDIF 40 41IF OPUS_ARM_MAY_HAVE_NEON 42 43; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 44xcorr_kernel_neon PROC 45xcorr_kernel_neon_start 46 ; input: 47 ; r3 = int len 48 ; r4 = opus_val16 *x 49 ; r5 = opus_val16 *y 50 ; q0 = opus_val32 sum[4] 51 ; output: 52 ; q0 = opus_val32 sum[4] 53 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 54 ; internal usage: 55 ; r12 = int j 56 ; d3 = y_3|y_2|y_1|y_0 57 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 58 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 59 ; q8 = scratch 60 ; 61 ; Load y[0...3] 62 ; This requires len>0 to always be valid (which we assert in the C code). 63 VLD1.16 {d5}, [r5]! 64 SUBS r12, r3, #8 65 BLE xcorr_kernel_neon_process4 66; Process 8 samples at a time. 67; This loop loads one y value more than we actually need. Therefore we have to 68; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid 69; reading past the end of the array. 70xcorr_kernel_neon_process8 71 ; This loop has 19 total instructions (10 cycles to issue, minimum), with 72 ; - 2 cycles of ARM insrtuctions, 73 ; - 10 cycles of load/store/byte permute instructions, and 74 ; - 9 cycles of data processing instructions. 75 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the 76 ; latter two categories, meaning the whole loop should run in 10 cycles per 77 ; iteration, barring cache misses. 78 ; 79 ; Load x[0...7] 80 VLD1.16 {d6, d7}, [r4]! 81 ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get 82 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. 83 VAND d3, d5, d5 84 SUBS r12, r12, #8 85 ; Load y[4...11] 86 VLD1.16 {d4, d5}, [r5]! 
87 VMLAL.S16 q0, d3, d6[0] 88 VEXT.16 d16, d3, d4, #1 89 VMLAL.S16 q0, d4, d7[0] 90 VEXT.16 d17, d4, d5, #1 91 VMLAL.S16 q0, d16, d6[1] 92 VEXT.16 d16, d3, d4, #2 93 VMLAL.S16 q0, d17, d7[1] 94 VEXT.16 d17, d4, d5, #2 95 VMLAL.S16 q0, d16, d6[2] 96 VEXT.16 d16, d3, d4, #3 97 VMLAL.S16 q0, d17, d7[2] 98 VEXT.16 d17, d4, d5, #3 99 VMLAL.S16 q0, d16, d6[3] 100 VMLAL.S16 q0, d17, d7[3] 101 BGT xcorr_kernel_neon_process8 102; Process 4 samples here if we have > 4 left (still reading one extra y value). 103xcorr_kernel_neon_process4 104 ADDS r12, r12, #4 105 BLE xcorr_kernel_neon_process2 106 ; Load x[0...3] 107 VLD1.16 d6, [r4]! 108 ; Use VAND since it's a data processing instruction again. 109 VAND d4, d5, d5 110 SUB r12, r12, #4 111 ; Load y[4...7] 112 VLD1.16 d5, [r5]! 113 VMLAL.S16 q0, d4, d6[0] 114 VEXT.16 d16, d4, d5, #1 115 VMLAL.S16 q0, d16, d6[1] 116 VEXT.16 d16, d4, d5, #2 117 VMLAL.S16 q0, d16, d6[2] 118 VEXT.16 d16, d4, d5, #3 119 VMLAL.S16 q0, d16, d6[3] 120; Process 2 samples here if we have > 2 left (still reading one extra y value). 121xcorr_kernel_neon_process2 122 ADDS r12, r12, #2 123 BLE xcorr_kernel_neon_process1 124 ; Load x[0...1] 125 VLD2.16 {d6[],d7[]}, [r4]! 126 ; Use VAND since it's a data processing instruction again. 127 VAND d4, d5, d5 128 SUB r12, r12, #2 129 ; Load y[4...5] 130 VLD1.32 {d5[]}, [r5]! 131 VMLAL.S16 q0, d4, d6 132 VEXT.16 d16, d4, d5, #1 133 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI 134 ; instead of VEXT, since it's a data-processing instruction. 135 VSRI.64 d5, d4, #32 136 VMLAL.S16 q0, d16, d7 137; Process 1 sample using the extra y value we loaded above. 138xcorr_kernel_neon_process1 139 ; Load next *x 140 VLD1.16 {d6[]}, [r4]! 141 ADDS r12, r12, #1 142 ; y[0...3] are left in d5 from prior iteration(s) (if any) 143 VMLAL.S16 q0, d5, d6 144 MOVLE pc, lr 145; Now process 1 last sample, not reading ahead. 146 ; Load last *y 147 VLD1.16 {d4[]}, [r5]! 
148 VSRI.64 d4, d5, #16 149 ; Load last *x 150 VLD1.16 {d6[]}, [r4]! 151 VMLAL.S16 q0, d4, d6 152 MOV pc, lr 153 ENDP 154 155; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, 156; opus_val32 *xcorr, int len, int max_pitch) 157celt_pitch_xcorr_neon PROC 158 ; input: 159 ; r0 = opus_val16 *_x 160 ; r1 = opus_val16 *_y 161 ; r2 = opus_val32 *xcorr 162 ; r3 = int len 163 ; output: 164 ; r0 = int maxcorr 165 ; internal usage: 166 ; r4 = opus_val16 *x (for xcorr_kernel_neon()) 167 ; r5 = opus_val16 *y (for xcorr_kernel_neon()) 168 ; r6 = int max_pitch 169 ; r12 = int j 170 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) 171 STMFD sp!, {r4-r6, lr} 172 LDR r6, [sp, #16] 173 VMOV.S32 q15, #1 174 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done 175 SUBS r6, r6, #4 176 BLT celt_pitch_xcorr_neon_process4_done 177celt_pitch_xcorr_neon_process4 178 ; xcorr_kernel_neon parameters: 179 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} 180 MOV r4, r0 181 MOV r5, r1 182 VEOR q0, q0, q0 183 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. 184 ; So we don't save/restore any other registers. 185 BL xcorr_kernel_neon_start 186 SUBS r6, r6, #4 187 VST1.32 {q0}, [r2]! 188 ; _y += 4 189 ADD r1, r1, #8 190 VMAX.S32 q15, q15, q0 191 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done 192 BGE celt_pitch_xcorr_neon_process4 193; We have less than 4 sums left to compute. 194celt_pitch_xcorr_neon_process4_done 195 ADDS r6, r6, #4 196 ; Reduce maxcorr to a single value 197 VMAX.S32 d30, d30, d31 198 VPMAX.S32 d30, d30, d30 199 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done 200 BLE celt_pitch_xcorr_neon_done 201; Now compute each remaining sum one at a time. 202celt_pitch_xcorr_neon_process_remaining 203 MOV r4, r0 204 MOV r5, r1 205 VMOV.I32 q0, #0 206 SUBS r12, r3, #8 207 BLT celt_pitch_xcorr_neon_process_remaining4 208; Sum terms 8 at a time. 
209celt_pitch_xcorr_neon_process_remaining_loop8 210 ; Load x[0...7] 211 VLD1.16 {q1}, [r4]! 212 ; Load y[0...7] 213 VLD1.16 {q2}, [r5]! 214 SUBS r12, r12, #8 215 VMLAL.S16 q0, d4, d2 216 VMLAL.S16 q0, d5, d3 217 BGE celt_pitch_xcorr_neon_process_remaining_loop8 218; Sum terms 4 at a time. 219celt_pitch_xcorr_neon_process_remaining4 220 ADDS r12, r12, #4 221 BLT celt_pitch_xcorr_neon_process_remaining4_done 222 ; Load x[0...3] 223 VLD1.16 {d2}, [r4]! 224 ; Load y[0...3] 225 VLD1.16 {d3}, [r5]! 226 SUB r12, r12, #4 227 VMLAL.S16 q0, d3, d2 228celt_pitch_xcorr_neon_process_remaining4_done 229 ; Reduce the sum to a single value. 230 VADD.S32 d0, d0, d1 231 VPADDL.S32 d0, d0 232 ADDS r12, r12, #4 233 BLE celt_pitch_xcorr_neon_process_remaining_loop_done 234; Sum terms 1 at a time. 235celt_pitch_xcorr_neon_process_remaining_loop1 236 VLD1.16 {d2[]}, [r4]! 237 VLD1.16 {d3[]}, [r5]! 238 SUBS r12, r12, #1 239 VMLAL.S16 q0, d2, d3 240 BGT celt_pitch_xcorr_neon_process_remaining_loop1 241celt_pitch_xcorr_neon_process_remaining_loop_done 242 VST1.32 {d0[0]}, [r2]! 243 VMAX.S32 d30, d30, d0 244 SUBS r6, r6, #1 245 ; _y++ 246 ADD r1, r1, #2 247 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining 248 BGT celt_pitch_xcorr_neon_process_remaining 249celt_pitch_xcorr_neon_done 250 VMOV.32 r0, d30[0] 251 LDMFD sp!, {r4-r6, pc} 252 ENDP 253 254ENDIF 255 256IF OPUS_ARM_MAY_HAVE_EDSP 257 258; This will get used on ARMv7 devices without NEON, so it has been optimized 259; to take advantage of dual-issuing where possible. 
260xcorr_kernel_edsp PROC 261xcorr_kernel_edsp_start 262 ; input: 263 ; r3 = int len 264 ; r4 = opus_val16 *_x (must be 32-bit aligned) 265 ; r5 = opus_val16 *_y (must be 32-bit aligned) 266 ; r6...r9 = opus_val32 sum[4] 267 ; output: 268 ; r6...r9 = opus_val32 sum[4] 269 ; preserved: r0-r5 270 ; internal usage 271 ; r2 = int j 272 ; r12,r14 = opus_val16 x[4] 273 ; r10,r11 = opus_val16 y[4] 274 STMFD sp!, {r2,r4,r5,lr} 275 LDR r10, [r5], #4 ; Load y[0...1] 276 SUBS r2, r3, #4 ; j = len-4 277 LDR r11, [r5], #4 ; Load y[2...3] 278 BLE xcorr_kernel_edsp_process4_done 279 LDR r12, [r4], #4 ; Load x[0...1] 280 ; Stall 281xcorr_kernel_edsp_process4 282 ; The multiplies must issue from pipeline 0, and can't dual-issue with each 283 ; other. Every other instruction here dual-issues with a multiply, and is 284 ; thus "free". There should be no stalls in the body of the loop. 285 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) 286 LDR r14, [r4], #4 ; Load x[2...3] 287 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) 288 SUBS r2, r2, #4 ; j-=4 289 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) 290 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) 291 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) 292 LDR r10, [r5], #4 ; Load y[4...5] 293 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) 294 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) 295 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) 296 LDRGT r12, [r4], #4 ; Load x[0...1] 297 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) 298 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) 299 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) 300 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) 301 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) 302 LDR r11, [r5], #4 ; Load y[6...7] 303 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) 304 SMLATT r8, r14, r10, r8 ; sum[2] = 
MAC16_16(sum[2],x_3,y_5) 305 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) 306 BGT xcorr_kernel_edsp_process4 307xcorr_kernel_edsp_process4_done 308 ADDS r2, r2, #4 309 BLE xcorr_kernel_edsp_done 310 LDRH r12, [r4], #2 ; r12 = *x++ 311 SUBS r2, r2, #1 ; j-- 312 ; Stall 313 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) 314 LDRHGT r14, [r4], #2 ; r14 = *x++ 315 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) 316 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) 317 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) 318 BLE xcorr_kernel_edsp_done 319 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) 320 SUBS r2, r2, #1 ; j-- 321 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) 322 LDRH r10, [r5], #2 ; r10 = y_4 = *y++ 323 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) 324 LDRHGT r12, [r4], #2 ; r12 = *x++ 325 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) 326 BLE xcorr_kernel_edsp_done 327 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) 328 CMP r2, #1 ; j-- 329 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) 330 LDRH r2, [r5], #2 ; r2 = y_5 = *y++ 331 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) 332 LDRHGT r14, [r4] ; r14 = *x 333 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) 334 BLE xcorr_kernel_edsp_done 335 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) 336 LDRH r11, [r5] ; r11 = y_6 = *y 337 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) 338 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) 339 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) 340xcorr_kernel_edsp_done 341 LDMFD sp!, {r2,r4,r5,pc} 342 ENDP 343 344celt_pitch_xcorr_edsp PROC 345 ; input: 346 ; r0 = opus_val16 *_x (must be 32-bit aligned) 347 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) 348 ; r2 = opus_val32 *xcorr 349 ; r3 = int len 350 ; output: 351 ; r0 = maxcorr 352 ; internal usage 353 ; r4 = 
opus_val16 *x 354 ; r5 = opus_val16 *y 355 ; r6 = opus_val32 sum0 356 ; r7 = opus_val32 sum1 357 ; r8 = opus_val32 sum2 358 ; r9 = opus_val32 sum3 359 ; r1 = int max_pitch 360 ; r12 = int j 361 STMFD sp!, {r4-r11, lr} 362 MOV r5, r1 363 LDR r1, [sp, #36] 364 MOV r4, r0 365 TST r5, #3 366 ; maxcorr = 1 367 MOV r0, #1 368 BEQ celt_pitch_xcorr_edsp_process1u_done 369; Compute one sum at the start to make y 32-bit aligned. 370 SUBS r12, r3, #4 371 ; r14 = sum = 0 372 MOV r14, #0 373 LDRH r8, [r5], #2 374 BLE celt_pitch_xcorr_edsp_process1u_loop4_done 375 LDR r6, [r4], #4 376 MOV r8, r8, LSL #16 377celt_pitch_xcorr_edsp_process1u_loop4 378 LDR r9, [r5], #4 379 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 380 LDR r7, [r4], #4 381 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) 382 LDR r8, [r5], #4 383 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) 384 SUBS r12, r12, #4 ; j-=4 385 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) 386 LDRGT r6, [r4], #4 387 BGT celt_pitch_xcorr_edsp_process1u_loop4 388 MOV r8, r8, LSR #16 389celt_pitch_xcorr_edsp_process1u_loop4_done 390 ADDS r12, r12, #4 391celt_pitch_xcorr_edsp_process1u_loop1 392 LDRHGE r6, [r4], #2 393 ; Stall 394 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) 395 SUBSGE r12, r12, #1 396 LDRHGT r8, [r5], #2 397 BGT celt_pitch_xcorr_edsp_process1u_loop1 398 ; Restore _x 399 SUB r4, r4, r3, LSL #1 400 ; Restore and advance _y 401 SUB r5, r5, r3, LSL #1 402 ; maxcorr = max(maxcorr, sum) 403 CMP r0, r14 404 ADD r5, r5, #2 405 MOVLT r0, r14 406 SUBS r1, r1, #1 407 ; xcorr[i] = sum 408 STR r14, [r2], #4 409 BLE celt_pitch_xcorr_edsp_done 410celt_pitch_xcorr_edsp_process1u_done 411 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 412 SUBS r1, r1, #4 413 BLT celt_pitch_xcorr_edsp_process2 414celt_pitch_xcorr_edsp_process4 415 ; xcorr_kernel_edsp parameters: 416 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} 417 MOV r6, #0 418 MOV r7, #0 419 MOV r8, #0 420 MOV 
r9, #0 421 BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) 422 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) 423 CMP r0, r6 424 ; _y+=4 425 ADD r5, r5, #8 426 MOVLT r0, r6 427 CMP r0, r7 428 MOVLT r0, r7 429 CMP r0, r8 430 MOVLT r0, r8 431 CMP r0, r9 432 MOVLT r0, r9 433 STMIA r2!, {r6-r9} 434 SUBS r1, r1, #4 435 BGE celt_pitch_xcorr_edsp_process4 436celt_pitch_xcorr_edsp_process2 437 ADDS r1, r1, #2 438 BLT celt_pitch_xcorr_edsp_process1a 439 SUBS r12, r3, #4 440 ; {r10, r11} = {sum0, sum1} = {0, 0} 441 MOV r10, #0 442 MOV r11, #0 443 LDR r8, [r5], #4 444 BLE celt_pitch_xcorr_edsp_process2_loop_done 445 LDR r6, [r4], #4 446 LDR r9, [r5], #4 447celt_pitch_xcorr_edsp_process2_loop4 448 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) 449 LDR r7, [r4], #4 450 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 451 SUBS r12, r12, #4 ; j-=4 452 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) 453 LDR r8, [r5], #4 454 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) 455 LDRGT r6, [r4], #4 456 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) 457 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) 458 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) 459 LDRGT r9, [r5], #4 460 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) 461 BGT celt_pitch_xcorr_edsp_process2_loop4 462celt_pitch_xcorr_edsp_process2_loop_done 463 ADDS r12, r12, #2 464 BLE celt_pitch_xcorr_edsp_process2_1 465 LDR r6, [r4], #4 466 ; Stall 467 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) 468 LDR r9, [r5], #4 469 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 470 SUB r12, r12, #2 471 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) 472 MOV r8, r9 473 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) 474celt_pitch_xcorr_edsp_process2_1 475 LDRH r6, [r4], #2 476 ADDS r12, r12, #1 477 ; Stall 478 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) 479 LDRHGT r7, [r4], #2 
480 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 481 BLE celt_pitch_xcorr_edsp_process2_done 482 LDRH r9, [r5], #2 483 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) 484 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) 485celt_pitch_xcorr_edsp_process2_done 486 ; Restore _x 487 SUB r4, r4, r3, LSL #1 488 ; Restore and advance _y 489 SUB r5, r5, r3, LSL #1 490 ; maxcorr = max(maxcorr, sum0) 491 CMP r0, r10 492 ADD r5, r5, #2 493 MOVLT r0, r10 494 SUB r1, r1, #2 495 ; maxcorr = max(maxcorr, sum1) 496 CMP r0, r11 497 ; xcorr[i] = sum 498 STR r10, [r2], #4 499 MOVLT r0, r11 500 STR r11, [r2], #4 501celt_pitch_xcorr_edsp_process1a 502 ADDS r1, r1, #1 503 BLT celt_pitch_xcorr_edsp_done 504 SUBS r12, r3, #4 505 ; r14 = sum = 0 506 MOV r14, #0 507 BLT celt_pitch_xcorr_edsp_process1a_loop_done 508 LDR r6, [r4], #4 509 LDR r8, [r5], #4 510 LDR r7, [r4], #4 511 LDR r9, [r5], #4 512celt_pitch_xcorr_edsp_process1a_loop4 513 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 514 SUBS r12, r12, #4 ; j-=4 515 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) 516 LDRGE r6, [r4], #4 517 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) 518 LDRGE r8, [r5], #4 519 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) 520 LDRGE r7, [r4], #4 521 LDRGE r9, [r5], #4 522 BGE celt_pitch_xcorr_edsp_process1a_loop4 523celt_pitch_xcorr_edsp_process1a_loop_done 524 ADDS r12, r12, #2 525 LDRGE r6, [r4], #4 526 LDRGE r8, [r5], #4 527 ; Stall 528 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 529 SUBGE r12, r12, #2 530 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) 531 ADDS r12, r12, #1 532 LDRHGE r6, [r4], #2 533 LDRHGE r8, [r5], #2 534 ; Stall 535 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) 536 ; maxcorr = max(maxcorr, sum) 537 CMP r0, r14 538 ; xcorr[i] = sum 539 STR r14, [r2], #4 540 MOVLT r0, r14 541celt_pitch_xcorr_edsp_done 542 LDMFD sp!, {r4-r11, pc} 543 ENDP 544 545ENDIF 546 547END 548