/* Do not modify. This file is auto-generated from armv4-mont.pl. */
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

#if __ARM_MAX_ARCH__>=7
.align	5
@ Run-time capability word: on Win32 the absolute address of
@ OPENSSL_armcap_P, elsewhere its offset from .Lbn_mul_mont so the
@ code can locate it PC-relatively (position-independent).
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lbn_mul_mont
# endif
#endif

@ -----------------------------------------------------------------------
@ bn_mul_mont — word-serial Montgomery multiplication.
@
@ Arguments (AAPCS): r0=rp, r1=ap, r2=bp, r3=np, [sp]=&n0, [sp,#4]=num
@ (roles taken from the inline comments below: "load num", "&n0",
@ "bp[0]", "ap[0]", "np[0]", "pull rp").
@ Returns r0=1 on success, r0=0 when num<2 (the .Labrt path).
@
@ Scratch tp[num+1] is carved out of the stack; during the loops:
@   r0  = &tp[num-1] (loop bound; slots 12..15 above it cache
@         rp, bp cursor, n0 value and &bp[num])
@   r4  = tp write cursor, r5/r6 = current ap/np words, r8 = n0 (then m),
@   r10/r11 and r12/r14 = 64-bit umlal accumulator/carry pairs.
@ -----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	@ Dispatch to the NEON path only when num is a multiple of 8
	@ and the capability word advertises NEON.
	tst	ip,#7
	bne	.Lialu
	ldr	r0,.LOPENSSL_armcap
#if !defined(_WIN32)
	adr	r2,.Lbn_mul_mont
	ldr	r0,[r0,r2]		@ OPENSSL_armcap_P via PC-relative offset
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r0,[r0]			@ extra indirection through the pointer
# endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}		@ reload saved rp/bp arguments
	beq	.Lialu
	add	sp,sp,#8		@ drop the saved {r0,r2} before tail call
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	r0,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	@ num<2 is rejected: return 0 without doing any work.
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	@ Slots at r0+12*4..r0+15*4 are the saved {r0,r2} block plus the
	@ two argument words above it; they cache rp, bp, n0 and &bp[num].
	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num]

	@ First pass (i=0): tp[] = ap[]*bp[0] + m*np[], m = tp[0]*n0.
	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
	mov	r4,sp

.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
	mov	r14,#0
	umlal	r12,r14,r6,r8	@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.L1st

	adds	r12,r12,r11
	ldr	r4,[r0,#13*4]		@ restore bp
	mov	r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	mov	r7,sp
	str	r14,[r0,#4]		@ tp[num]=

@ Outer loop over b[i], i=1..num-1: tp[] += ap[]*bp[i] + m*np[],
@ then shift tp down one word (the str at .Linner writes tp[j-1]).
.Louter:
	sub	r7,r0,r7		@ "original" r0-1 value
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8		@ m = tp[0]*n0 (r8 still holds n0 here)
	mov	r12,#0
	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
	mov	r4,sp

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
	mov	r14,#0
	umlal	r12,r14,r6,r8	@ np[j]*n0
	adc	r11,r11,#0		@ absorb carry from the tp[j] addition
	ldr	r7,[r4,#8]		@ tp[j+1]
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	adc	r14,r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adds	r12,r12,r7
	ldr	r7,[r0,#15*4]		@ restore &bp[num]
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7			@ done when bp cursor reaches &bp[num]
#ifdef	__thumb2__
	itt	ne
#endif
	movne	r7,sp
	bne	.Louter

	@ Final reduction: rp[] = tp[] - np[]; if the subtraction borrows
	@ (and the top carry r14 doesn't cover it), keep tp[] instead.
	ldr	r2,[r0,#12*4]		@ pull rp
	mov	r5,sp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,r5		@ "original" num value
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	subs	r7,r7,r7		@ "clear" carry flag
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	@ Constant-shape conditional copy: always walk all num words; CC
	@ (borrow) selects tp[j] over the already-stored difference.
.Lcopy:	ldr	r7,[r4]		@ conditional copy
	ldr	r5,[r2]
	str	sp,[r4],#4		@ zap tp (sp is just a junk value to overwrite secrets)
#ifdef	__thumb2__
	it	cc
#endif
	movcc	r5,r7
	str	r5,[r2],#4
	teq	r4,r0			@ preserve carry
	bne	.Lcopy

	mov	sp,r0
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1			@ success
.Labrt:
#if __ARM_ARCH__>=5
	bx	lr				@ bx lr
#else
	@ Pre-v5 return sequence that still interworks with Thumb callers.
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ bn_mul8x_mont_neon — NEON path for num divisible by 8; extra
@ arguments (n0, num) arrive on the stack and are loaded via ip.
@ NOTE(review): the function body continues beyond this chunk.
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
	ldmia	ip,{r4,r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	r5,#8
	bhi	.LNEON_8n

	@ special case for r5==8, everything is in register bank...

	vld1.32	{d28[0]}, [r2,:32]!
220bc3d5698SJohn Baldwin veor d8,d8,d8 221bc3d5698SJohn Baldwin sub r7,sp,r5,lsl#4 222bc3d5698SJohn Baldwin vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( 223bc3d5698SJohn Baldwin and r7,r7,#-64 224bc3d5698SJohn Baldwin vld1.32 {d30[0]}, [r4,:32] 225bc3d5698SJohn Baldwin mov sp,r7 @ alloca 226bc3d5698SJohn Baldwin vzip.16 d28,d8 227bc3d5698SJohn Baldwin 228bc3d5698SJohn Baldwin vmull.u32 q6,d28,d0[0] 229bc3d5698SJohn Baldwin vmull.u32 q7,d28,d0[1] 230bc3d5698SJohn Baldwin vmull.u32 q8,d28,d1[0] 231bc3d5698SJohn Baldwin vshl.i64 d29,d13,#16 232bc3d5698SJohn Baldwin vmull.u32 q9,d28,d1[1] 233bc3d5698SJohn Baldwin 234bc3d5698SJohn Baldwin vadd.u64 d29,d29,d12 235bc3d5698SJohn Baldwin veor d8,d8,d8 236bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 237bc3d5698SJohn Baldwin 238bc3d5698SJohn Baldwin vmull.u32 q10,d28,d2[0] 239bc3d5698SJohn Baldwin vld1.32 {d4,d5,d6,d7}, [r3]! 240bc3d5698SJohn Baldwin vmull.u32 q11,d28,d2[1] 241bc3d5698SJohn Baldwin vmull.u32 q12,d28,d3[0] 242bc3d5698SJohn Baldwin vzip.16 d29,d8 243bc3d5698SJohn Baldwin vmull.u32 q13,d28,d3[1] 244bc3d5698SJohn Baldwin 245bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[0] 246bc3d5698SJohn Baldwin sub r9,r5,#1 247bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[1] 248bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[0] 249bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[1] 250bc3d5698SJohn Baldwin 251bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[0] 252bc3d5698SJohn Baldwin vmov q5,q6 253bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[1] 254bc3d5698SJohn Baldwin vmov q6,q7 255bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[0] 256bc3d5698SJohn Baldwin vmov q7,q8 257bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[1] 258bc3d5698SJohn Baldwin vmov q8,q9 259bc3d5698SJohn Baldwin vmov q9,q10 260bc3d5698SJohn Baldwin vshr.u64 d10,d10,#16 261bc3d5698SJohn Baldwin vmov q10,q11 262bc3d5698SJohn Baldwin vmov q11,q12 263bc3d5698SJohn Baldwin vadd.u64 d10,d10,d11 264bc3d5698SJohn Baldwin vmov q12,q13 265bc3d5698SJohn Baldwin veor q13,q13 266bc3d5698SJohn Baldwin 
vshr.u64 d10,d10,#16 267bc3d5698SJohn Baldwin 268bc3d5698SJohn Baldwin b .LNEON_outer8 269bc3d5698SJohn Baldwin 270bc3d5698SJohn Baldwin.align 4 271bc3d5698SJohn Baldwin.LNEON_outer8: 272bc3d5698SJohn Baldwin vld1.32 {d28[0]}, [r2,:32]! 273bc3d5698SJohn Baldwin veor d8,d8,d8 274bc3d5698SJohn Baldwin vzip.16 d28,d8 275bc3d5698SJohn Baldwin vadd.u64 d12,d12,d10 276bc3d5698SJohn Baldwin 277bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d0[0] 278bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d0[1] 279bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d1[0] 280bc3d5698SJohn Baldwin vshl.i64 d29,d13,#16 281bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d1[1] 282bc3d5698SJohn Baldwin 283bc3d5698SJohn Baldwin vadd.u64 d29,d29,d12 284bc3d5698SJohn Baldwin veor d8,d8,d8 285bc3d5698SJohn Baldwin subs r9,r9,#1 286bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 287bc3d5698SJohn Baldwin 288bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d2[0] 289bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d2[1] 290bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d3[0] 291bc3d5698SJohn Baldwin vzip.16 d29,d8 292bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d3[1] 293bc3d5698SJohn Baldwin 294bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[0] 295bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[1] 296bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[0] 297bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[1] 298bc3d5698SJohn Baldwin 299bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[0] 300bc3d5698SJohn Baldwin vmov q5,q6 301bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[1] 302bc3d5698SJohn Baldwin vmov q6,q7 303bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[0] 304bc3d5698SJohn Baldwin vmov q7,q8 305bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[1] 306bc3d5698SJohn Baldwin vmov q8,q9 307bc3d5698SJohn Baldwin vmov q9,q10 308bc3d5698SJohn Baldwin vshr.u64 d10,d10,#16 309bc3d5698SJohn Baldwin vmov q10,q11 310bc3d5698SJohn Baldwin vmov q11,q12 311bc3d5698SJohn Baldwin vadd.u64 d10,d10,d11 312bc3d5698SJohn Baldwin vmov q12,q13 313bc3d5698SJohn Baldwin veor q13,q13 314bc3d5698SJohn Baldwin vshr.u64 d10,d10,#16 
315bc3d5698SJohn Baldwin 316bc3d5698SJohn Baldwin bne .LNEON_outer8 317bc3d5698SJohn Baldwin 318bc3d5698SJohn Baldwin vadd.u64 d12,d12,d10 319bc3d5698SJohn Baldwin mov r7,sp 320bc3d5698SJohn Baldwin vshr.u64 d10,d12,#16 321bc3d5698SJohn Baldwin mov r8,r5 322bc3d5698SJohn Baldwin vadd.u64 d13,d13,d10 323bc3d5698SJohn Baldwin add r6,sp,#96 324bc3d5698SJohn Baldwin vshr.u64 d10,d13,#16 325bc3d5698SJohn Baldwin vzip.16 d12,d13 326bc3d5698SJohn Baldwin 327bc3d5698SJohn Baldwin b .LNEON_tail_entry 328bc3d5698SJohn Baldwin 329bc3d5698SJohn Baldwin.align 4 330bc3d5698SJohn Baldwin.LNEON_8n: 331bc3d5698SJohn Baldwin veor q6,q6,q6 332bc3d5698SJohn Baldwin sub r7,sp,#128 333bc3d5698SJohn Baldwin veor q7,q7,q7 334bc3d5698SJohn Baldwin sub r7,r7,r5,lsl#4 335bc3d5698SJohn Baldwin veor q8,q8,q8 336bc3d5698SJohn Baldwin and r7,r7,#-64 337bc3d5698SJohn Baldwin veor q9,q9,q9 338bc3d5698SJohn Baldwin mov sp,r7 @ alloca 339bc3d5698SJohn Baldwin veor q10,q10,q10 340bc3d5698SJohn Baldwin add r7,r7,#256 341bc3d5698SJohn Baldwin veor q11,q11,q11 342bc3d5698SJohn Baldwin sub r8,r5,#8 343bc3d5698SJohn Baldwin veor q12,q12,q12 344bc3d5698SJohn Baldwin veor q13,q13,q13 345bc3d5698SJohn Baldwin 346bc3d5698SJohn Baldwin.LNEON_8n_init: 347bc3d5698SJohn Baldwin vst1.64 {q6,q7},[r7,:256]! 348bc3d5698SJohn Baldwin subs r8,r8,#8 349bc3d5698SJohn Baldwin vst1.64 {q8,q9},[r7,:256]! 350bc3d5698SJohn Baldwin vst1.64 {q10,q11},[r7,:256]! 351bc3d5698SJohn Baldwin vst1.64 {q12,q13},[r7,:256]! 352bc3d5698SJohn Baldwin bne .LNEON_8n_init 353bc3d5698SJohn Baldwin 354bc3d5698SJohn Baldwin add r6,sp,#256 355bc3d5698SJohn Baldwin vld1.32 {d0,d1,d2,d3},[r1]! 356bc3d5698SJohn Baldwin add r10,sp,#8 357bc3d5698SJohn Baldwin vld1.32 {d30[0]},[r4,:32] 358bc3d5698SJohn Baldwin mov r9,r5 359bc3d5698SJohn Baldwin b .LNEON_8n_outer 360bc3d5698SJohn Baldwin 361bc3d5698SJohn Baldwin.align 4 362bc3d5698SJohn Baldwin.LNEON_8n_outer: 363bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! 
@ *b++ 364bc3d5698SJohn Baldwin veor d8,d8,d8 365bc3d5698SJohn Baldwin vzip.16 d28,d8 366bc3d5698SJohn Baldwin add r7,sp,#128 367bc3d5698SJohn Baldwin vld1.32 {d4,d5,d6,d7},[r3]! 368bc3d5698SJohn Baldwin 369bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d0[0] 370bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d0[1] 371bc3d5698SJohn Baldwin veor d8,d8,d8 372bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d1[0] 373bc3d5698SJohn Baldwin vshl.i64 d29,d13,#16 374bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d1[1] 375bc3d5698SJohn Baldwin vadd.u64 d29,d29,d12 376bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d2[0] 377bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 378bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d2[1] 379bc3d5698SJohn Baldwin vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] 380bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d3[0] 381bc3d5698SJohn Baldwin vzip.16 d29,d8 382bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d3[1] 383bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! @ *b++ 384bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[0] 385bc3d5698SJohn Baldwin veor d10,d10,d10 386bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[1] 387bc3d5698SJohn Baldwin vzip.16 d28,d10 388bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[0] 389bc3d5698SJohn Baldwin vshr.u64 d12,d12,#16 390bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[1] 391bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[0] 392bc3d5698SJohn Baldwin vadd.u64 d12,d12,d13 393bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[1] 394bc3d5698SJohn Baldwin vshr.u64 d12,d12,#16 395bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[0] 396bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[1] 397bc3d5698SJohn Baldwin vadd.u64 d14,d14,d12 398bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] 399bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d0[0] 400bc3d5698SJohn Baldwin vld1.64 {q6},[r6,:128]! 
401bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d0[1] 402bc3d5698SJohn Baldwin veor d8,d8,d8 403bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d1[0] 404bc3d5698SJohn Baldwin vshl.i64 d29,d15,#16 405bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d1[1] 406bc3d5698SJohn Baldwin vadd.u64 d29,d29,d14 407bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d2[0] 408bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 409bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d2[1] 410bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] 411bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d3[0] 412bc3d5698SJohn Baldwin vzip.16 d29,d8 413bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d3[1] 414bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! @ *b++ 415bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[0] 416bc3d5698SJohn Baldwin veor d10,d10,d10 417bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d4[1] 418bc3d5698SJohn Baldwin vzip.16 d28,d10 419bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[0] 420bc3d5698SJohn Baldwin vshr.u64 d14,d14,#16 421bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d5[1] 422bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[0] 423bc3d5698SJohn Baldwin vadd.u64 d14,d14,d15 424bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d6[1] 425bc3d5698SJohn Baldwin vshr.u64 d14,d14,#16 426bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[0] 427bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d7[1] 428bc3d5698SJohn Baldwin vadd.u64 d16,d16,d14 429bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] 430bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d0[0] 431bc3d5698SJohn Baldwin vld1.64 {q7},[r6,:128]! 432bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d0[1] 433bc3d5698SJohn Baldwin veor d8,d8,d8 434bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d1[0] 435bc3d5698SJohn Baldwin vshl.i64 d29,d17,#16 436bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d1[1] 437bc3d5698SJohn Baldwin vadd.u64 d29,d29,d16 438bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d2[0] 439bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 440bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d2[1] 441bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+2] 442bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d3[0] 443bc3d5698SJohn Baldwin vzip.16 d29,d8 444bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d3[1] 445bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! @ *b++ 446bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d4[0] 447bc3d5698SJohn Baldwin veor d10,d10,d10 448bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d4[1] 449bc3d5698SJohn Baldwin vzip.16 d28,d10 450bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d5[0] 451bc3d5698SJohn Baldwin vshr.u64 d16,d16,#16 452bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d5[1] 453bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d6[0] 454bc3d5698SJohn Baldwin vadd.u64 d16,d16,d17 455bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d6[1] 456bc3d5698SJohn Baldwin vshr.u64 d16,d16,#16 457bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d7[0] 458bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d7[1] 459bc3d5698SJohn Baldwin vadd.u64 d18,d18,d16 460bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] 461bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d0[0] 462bc3d5698SJohn Baldwin vld1.64 {q8},[r6,:128]! 463bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d0[1] 464bc3d5698SJohn Baldwin veor d8,d8,d8 465bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d1[0] 466bc3d5698SJohn Baldwin vshl.i64 d29,d19,#16 467bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d1[1] 468bc3d5698SJohn Baldwin vadd.u64 d29,d29,d18 469bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d2[0] 470bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 471bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d2[1] 472bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] 473bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d3[0] 474bc3d5698SJohn Baldwin vzip.16 d29,d8 475bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d3[1] 476bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! 
@ *b++ 477bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d4[0] 478bc3d5698SJohn Baldwin veor d10,d10,d10 479bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d4[1] 480bc3d5698SJohn Baldwin vzip.16 d28,d10 481bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d5[0] 482bc3d5698SJohn Baldwin vshr.u64 d18,d18,#16 483bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d5[1] 484bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d6[0] 485bc3d5698SJohn Baldwin vadd.u64 d18,d18,d19 486bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d6[1] 487bc3d5698SJohn Baldwin vshr.u64 d18,d18,#16 488bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d7[0] 489bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d7[1] 490bc3d5698SJohn Baldwin vadd.u64 d20,d20,d18 491bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] 492bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d0[0] 493bc3d5698SJohn Baldwin vld1.64 {q9},[r6,:128]! 494bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d0[1] 495bc3d5698SJohn Baldwin veor d8,d8,d8 496bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d1[0] 497bc3d5698SJohn Baldwin vshl.i64 d29,d21,#16 498bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d1[1] 499bc3d5698SJohn Baldwin vadd.u64 d29,d29,d20 500bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d2[0] 501bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 502bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d2[1] 503bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] 504bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d3[0] 505bc3d5698SJohn Baldwin vzip.16 d29,d8 506bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d3[1] 507bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! 
@ *b++ 508bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d4[0] 509bc3d5698SJohn Baldwin veor d10,d10,d10 510bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d4[1] 511bc3d5698SJohn Baldwin vzip.16 d28,d10 512bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d5[0] 513bc3d5698SJohn Baldwin vshr.u64 d20,d20,#16 514bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d5[1] 515bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d6[0] 516bc3d5698SJohn Baldwin vadd.u64 d20,d20,d21 517bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d6[1] 518bc3d5698SJohn Baldwin vshr.u64 d20,d20,#16 519bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d7[0] 520bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d7[1] 521bc3d5698SJohn Baldwin vadd.u64 d22,d22,d20 522bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] 523bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d0[0] 524bc3d5698SJohn Baldwin vld1.64 {q10},[r6,:128]! 525bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d0[1] 526bc3d5698SJohn Baldwin veor d8,d8,d8 527bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d1[0] 528bc3d5698SJohn Baldwin vshl.i64 d29,d23,#16 529bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d1[1] 530bc3d5698SJohn Baldwin vadd.u64 d29,d29,d22 531bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d2[0] 532bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 533bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d2[1] 534bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] 535bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d3[0] 536bc3d5698SJohn Baldwin vzip.16 d29,d8 537bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d3[1] 538bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! 
@ *b++ 539bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d4[0] 540bc3d5698SJohn Baldwin veor d10,d10,d10 541bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d4[1] 542bc3d5698SJohn Baldwin vzip.16 d28,d10 543bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d5[0] 544bc3d5698SJohn Baldwin vshr.u64 d22,d22,#16 545bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d5[1] 546bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d6[0] 547bc3d5698SJohn Baldwin vadd.u64 d22,d22,d23 548bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d6[1] 549bc3d5698SJohn Baldwin vshr.u64 d22,d22,#16 550bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d7[0] 551bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d7[1] 552bc3d5698SJohn Baldwin vadd.u64 d24,d24,d22 553bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] 554bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d0[0] 555bc3d5698SJohn Baldwin vld1.64 {q11},[r6,:128]! 556bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d0[1] 557bc3d5698SJohn Baldwin veor d8,d8,d8 558bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d1[0] 559bc3d5698SJohn Baldwin vshl.i64 d29,d25,#16 560bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d1[1] 561bc3d5698SJohn Baldwin vadd.u64 d29,d29,d24 562bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d2[0] 563bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 564bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d2[1] 565bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] 566bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d3[0] 567bc3d5698SJohn Baldwin vzip.16 d29,d8 568bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d3[1] 569bc3d5698SJohn Baldwin vld1.32 {d28[0]},[r2,:32]! 
@ *b++ 570bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d4[0] 571bc3d5698SJohn Baldwin veor d10,d10,d10 572bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d4[1] 573bc3d5698SJohn Baldwin vzip.16 d28,d10 574bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d5[0] 575bc3d5698SJohn Baldwin vshr.u64 d24,d24,#16 576bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d5[1] 577bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d6[0] 578bc3d5698SJohn Baldwin vadd.u64 d24,d24,d25 579bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d6[1] 580bc3d5698SJohn Baldwin vshr.u64 d24,d24,#16 581bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d7[0] 582bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d7[1] 583bc3d5698SJohn Baldwin vadd.u64 d26,d26,d24 584bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] 585bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d0[0] 586bc3d5698SJohn Baldwin vld1.64 {q12},[r6,:128]! 587bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d0[1] 588bc3d5698SJohn Baldwin veor d8,d8,d8 589bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d1[0] 590bc3d5698SJohn Baldwin vshl.i64 d29,d27,#16 591bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d1[1] 592bc3d5698SJohn Baldwin vadd.u64 d29,d29,d26 593bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d2[0] 594bc3d5698SJohn Baldwin vmul.u32 d29,d29,d30 595bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d2[1] 596bc3d5698SJohn Baldwin vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] 597bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d3[0] 598bc3d5698SJohn Baldwin vzip.16 d29,d8 599bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d3[1] 600bc3d5698SJohn Baldwin vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] 601bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d4[0] 602bc3d5698SJohn Baldwin vld1.32 {d0,d1,d2,d3},[r1]! 
603bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[1] 604bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d5[0] 605bc3d5698SJohn Baldwin vshr.u64 d26,d26,#16 606bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[1] 607bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d6[0] 608bc3d5698SJohn Baldwin vadd.u64 d26,d26,d27 609bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[1] 610bc3d5698SJohn Baldwin vshr.u64 d26,d26,#16 611bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d7[0] 612bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[1] 613bc3d5698SJohn Baldwin vadd.u64 d12,d12,d26 614bc3d5698SJohn Baldwin vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] 615bc3d5698SJohn Baldwin add r10,sp,#8 @ rewind 616bc3d5698SJohn Baldwin sub r8,r5,#8 617bc3d5698SJohn Baldwin b .LNEON_8n_inner 618bc3d5698SJohn Baldwin 619bc3d5698SJohn Baldwin.align 4 620bc3d5698SJohn Baldwin.LNEON_8n_inner: 621bc3d5698SJohn Baldwin subs r8,r8,#8 622bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d0[0] 623bc3d5698SJohn Baldwin vld1.64 {q13},[r6,:128] 624bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d0[1] 625bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] 626bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d1[0] 627bc3d5698SJohn Baldwin vld1.32 {d4,d5,d6,d7},[r3]! 628bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d1[1] 629bc3d5698SJohn Baldwin it ne 630bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 631bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d2[0] 632bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d2[1] 633bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d3[0] 634bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d3[1] 635bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] 636bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[0] 637bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[1] 638bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[0] 639bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[1] 640bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[0] 641bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[1] 642bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[0] 643bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[1] 644bc3d5698SJohn Baldwin vst1.64 {q6},[r7,:128]! 645bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d0[0] 646bc3d5698SJohn Baldwin vld1.64 {q6},[r6,:128] 647bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d0[1] 648bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] 649bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d1[0] 650bc3d5698SJohn Baldwin it ne 651bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 652bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d1[1] 653bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d2[0] 654bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d2[1] 655bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d3[0] 656bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d3[1] 657bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] 658bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d4[0] 659bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d4[1] 660bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d5[0] 661bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d5[1] 662bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d6[0] 663bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d6[1] 664bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d7[0] 665bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d7[1] 666bc3d5698SJohn Baldwin vst1.64 {q7},[r7,:128]! 667bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d0[0] 668bc3d5698SJohn Baldwin vld1.64 {q7},[r6,:128] 669bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d0[1] 670bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+2] 671bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d1[0] 672bc3d5698SJohn Baldwin it ne 673bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 674bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d1[1] 675bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d2[0] 676bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d2[1] 677bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d3[0] 678bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d3[1] 679bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] 680bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d4[0] 681bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d4[1] 682bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d5[0] 683bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d5[1] 684bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d6[0] 685bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d6[1] 686bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d7[0] 687bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d7[1] 688bc3d5698SJohn Baldwin vst1.64 {q8},[r7,:128]! 689bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d0[0] 690bc3d5698SJohn Baldwin vld1.64 {q8},[r6,:128] 691bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d0[1] 692bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] 693bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d1[0] 694bc3d5698SJohn Baldwin it ne 695bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 696bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d1[1] 697bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d2[0] 698bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d2[1] 699bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d3[0] 700bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d3[1] 701bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+4] 702bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d4[0] 703bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d4[1] 704bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d5[0] 705bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d5[1] 706bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d6[0] 707bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d6[1] 708bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d7[0] 709bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d7[1] 710bc3d5698SJohn Baldwin vst1.64 {q9},[r7,:128]! 711bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d0[0] 712bc3d5698SJohn Baldwin vld1.64 {q9},[r6,:128] 713bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d0[1] 714bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] 715bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d1[0] 716bc3d5698SJohn Baldwin it ne 717bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 718bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d1[1] 719bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d2[0] 720bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d2[1] 721bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d3[0] 722bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d3[1] 723bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] 724bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d4[0] 725bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d4[1] 726bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d5[0] 727bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d5[1] 728bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d6[0] 729bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d6[1] 730bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d7[0] 731bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d7[1] 732bc3d5698SJohn Baldwin vst1.64 {q10},[r7,:128]! 733bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d0[0] 734bc3d5698SJohn Baldwin vld1.64 {q10},[r6,:128] 735bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d0[1] 736bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+5] 737bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d1[0] 738bc3d5698SJohn Baldwin it ne 739bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 740bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d1[1] 741bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d2[0] 742bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d2[1] 743bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d3[0] 744bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d3[1] 745bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] 746bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d4[0] 747bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d4[1] 748bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d5[0] 749bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d5[1] 750bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d6[0] 751bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d6[1] 752bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d7[0] 753bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d7[1] 754bc3d5698SJohn Baldwin vst1.64 {q11},[r7,:128]! 755bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d0[0] 756bc3d5698SJohn Baldwin vld1.64 {q11},[r6,:128] 757bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d0[1] 758bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] 759bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d1[0] 760bc3d5698SJohn Baldwin it ne 761bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 762bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d1[1] 763bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d2[0] 764bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d2[1] 765bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d3[0] 766bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d3[1] 767bc3d5698SJohn Baldwin vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+7] 768bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d4[0] 769bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d4[1] 770bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d5[0] 771bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d5[1] 772bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d6[0] 773bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d6[1] 774bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d7[0] 775bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d7[1] 776bc3d5698SJohn Baldwin vst1.64 {q12},[r7,:128]! 777bc3d5698SJohn Baldwin vmlal.u32 q13,d28,d0[0] 778bc3d5698SJohn Baldwin vld1.64 {q12},[r6,:128] 779bc3d5698SJohn Baldwin vmlal.u32 q6,d28,d0[1] 780bc3d5698SJohn Baldwin vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] 781bc3d5698SJohn Baldwin vmlal.u32 q7,d28,d1[0] 782bc3d5698SJohn Baldwin it ne 783bc3d5698SJohn Baldwin addne r6,r6,#16 @ don't advance in last iteration 784bc3d5698SJohn Baldwin vmlal.u32 q8,d28,d1[1] 785bc3d5698SJohn Baldwin vmlal.u32 q9,d28,d2[0] 786bc3d5698SJohn Baldwin vmlal.u32 q10,d28,d2[1] 787bc3d5698SJohn Baldwin vmlal.u32 q11,d28,d3[0] 788bc3d5698SJohn Baldwin vmlal.u32 q12,d28,d3[1] 789bc3d5698SJohn Baldwin it eq 790bc3d5698SJohn Baldwin subeq r1,r1,r5,lsl#2 @ rewind 791bc3d5698SJohn Baldwin vmlal.u32 q13,d29,d4[0] 792bc3d5698SJohn Baldwin vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] 793bc3d5698SJohn Baldwin vmlal.u32 q6,d29,d4[1] 794bc3d5698SJohn Baldwin vld1.32 {d0,d1,d2,d3},[r1]! 795bc3d5698SJohn Baldwin vmlal.u32 q7,d29,d5[0] 796bc3d5698SJohn Baldwin add r10,sp,#8 @ rewind 797bc3d5698SJohn Baldwin vmlal.u32 q8,d29,d5[1] 798bc3d5698SJohn Baldwin vmlal.u32 q9,d29,d6[0] 799bc3d5698SJohn Baldwin vmlal.u32 q10,d29,d6[1] 800bc3d5698SJohn Baldwin vmlal.u32 q11,d29,d7[0] 801bc3d5698SJohn Baldwin vst1.64 {q13},[r7,:128]! 802bc3d5698SJohn Baldwin vmlal.u32 q12,d29,d7[1] 803bc3d5698SJohn Baldwin 804bc3d5698SJohn Baldwin bne .LNEON_8n_inner 805bc3d5698SJohn Baldwin add r6,sp,#128 806bc3d5698SJohn Baldwin vst1.64 {q6,q7},[r7,:256]! 
807bc3d5698SJohn Baldwin veor q2,q2,q2 @ d4-d5 808bc3d5698SJohn Baldwin vst1.64 {q8,q9},[r7,:256]! 809bc3d5698SJohn Baldwin veor q3,q3,q3 @ d6-d7 810bc3d5698SJohn Baldwin vst1.64 {q10,q11},[r7,:256]! 811bc3d5698SJohn Baldwin vst1.64 {q12},[r7,:128] 812bc3d5698SJohn Baldwin 813bc3d5698SJohn Baldwin subs r9,r9,#8 814bc3d5698SJohn Baldwin vld1.64 {q6,q7},[r6,:256]! 815bc3d5698SJohn Baldwin vld1.64 {q8,q9},[r6,:256]! 816bc3d5698SJohn Baldwin vld1.64 {q10,q11},[r6,:256]! 817bc3d5698SJohn Baldwin vld1.64 {q12,q13},[r6,:256]! 818bc3d5698SJohn Baldwin 819bc3d5698SJohn Baldwin itt ne 820bc3d5698SJohn Baldwin subne r3,r3,r5,lsl#2 @ rewind 821bc3d5698SJohn Baldwin bne .LNEON_8n_outer 822bc3d5698SJohn Baldwin 823bc3d5698SJohn Baldwin add r7,sp,#128 824bc3d5698SJohn Baldwin vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame 825bc3d5698SJohn Baldwin vshr.u64 d10,d12,#16 826bc3d5698SJohn Baldwin vst1.64 {q2,q3},[sp,:256]! 827bc3d5698SJohn Baldwin vadd.u64 d13,d13,d10 828bc3d5698SJohn Baldwin vst1.64 {q2,q3}, [sp,:256]! 829bc3d5698SJohn Baldwin vshr.u64 d10,d13,#16 830bc3d5698SJohn Baldwin vst1.64 {q2,q3}, [sp,:256]! 831bc3d5698SJohn Baldwin vzip.16 d12,d13 832bc3d5698SJohn Baldwin 833bc3d5698SJohn Baldwin mov r8,r5 834bc3d5698SJohn Baldwin b .LNEON_tail_entry 835bc3d5698SJohn Baldwin 836bc3d5698SJohn Baldwin.align 4 837bc3d5698SJohn Baldwin.LNEON_tail: 838bc3d5698SJohn Baldwin vadd.u64 d12,d12,d10 839bc3d5698SJohn Baldwin vshr.u64 d10,d12,#16 840bc3d5698SJohn Baldwin vld1.64 {q8,q9}, [r6, :256]! 841bc3d5698SJohn Baldwin vadd.u64 d13,d13,d10 842bc3d5698SJohn Baldwin vld1.64 {q10,q11}, [r6, :256]! 843bc3d5698SJohn Baldwin vshr.u64 d10,d13,#16 844bc3d5698SJohn Baldwin vld1.64 {q12,q13}, [r6, :256]! 845bc3d5698SJohn Baldwin vzip.16 d12,d13 846bc3d5698SJohn Baldwin 847bc3d5698SJohn Baldwin.LNEON_tail_entry: 848bc3d5698SJohn Baldwin vadd.u64 d14,d14,d10 849bc3d5698SJohn Baldwin vst1.32 {d12[0]}, [r7, :32]! 
850bc3d5698SJohn Baldwin vshr.u64 d10,d14,#16 851bc3d5698SJohn Baldwin vadd.u64 d15,d15,d10 852bc3d5698SJohn Baldwin vshr.u64 d10,d15,#16 853bc3d5698SJohn Baldwin vzip.16 d14,d15 854bc3d5698SJohn Baldwin vadd.u64 d16,d16,d10 855bc3d5698SJohn Baldwin vst1.32 {d14[0]}, [r7, :32]! 856bc3d5698SJohn Baldwin vshr.u64 d10,d16,#16 857bc3d5698SJohn Baldwin vadd.u64 d17,d17,d10 858bc3d5698SJohn Baldwin vshr.u64 d10,d17,#16 859bc3d5698SJohn Baldwin vzip.16 d16,d17 860bc3d5698SJohn Baldwin vadd.u64 d18,d18,d10 861bc3d5698SJohn Baldwin vst1.32 {d16[0]}, [r7, :32]! 862bc3d5698SJohn Baldwin vshr.u64 d10,d18,#16 863bc3d5698SJohn Baldwin vadd.u64 d19,d19,d10 864bc3d5698SJohn Baldwin vshr.u64 d10,d19,#16 865bc3d5698SJohn Baldwin vzip.16 d18,d19 866bc3d5698SJohn Baldwin vadd.u64 d20,d20,d10 867bc3d5698SJohn Baldwin vst1.32 {d18[0]}, [r7, :32]! 868bc3d5698SJohn Baldwin vshr.u64 d10,d20,#16 869bc3d5698SJohn Baldwin vadd.u64 d21,d21,d10 870bc3d5698SJohn Baldwin vshr.u64 d10,d21,#16 871bc3d5698SJohn Baldwin vzip.16 d20,d21 872bc3d5698SJohn Baldwin vadd.u64 d22,d22,d10 873bc3d5698SJohn Baldwin vst1.32 {d20[0]}, [r7, :32]! 874bc3d5698SJohn Baldwin vshr.u64 d10,d22,#16 875bc3d5698SJohn Baldwin vadd.u64 d23,d23,d10 876bc3d5698SJohn Baldwin vshr.u64 d10,d23,#16 877bc3d5698SJohn Baldwin vzip.16 d22,d23 878bc3d5698SJohn Baldwin vadd.u64 d24,d24,d10 879bc3d5698SJohn Baldwin vst1.32 {d22[0]}, [r7, :32]! 880bc3d5698SJohn Baldwin vshr.u64 d10,d24,#16 881bc3d5698SJohn Baldwin vadd.u64 d25,d25,d10 882bc3d5698SJohn Baldwin vshr.u64 d10,d25,#16 883bc3d5698SJohn Baldwin vzip.16 d24,d25 884bc3d5698SJohn Baldwin vadd.u64 d26,d26,d10 885bc3d5698SJohn Baldwin vst1.32 {d24[0]}, [r7, :32]! 886bc3d5698SJohn Baldwin vshr.u64 d10,d26,#16 887bc3d5698SJohn Baldwin vadd.u64 d27,d27,d10 888bc3d5698SJohn Baldwin vshr.u64 d10,d27,#16 889bc3d5698SJohn Baldwin vzip.16 d26,d27 890bc3d5698SJohn Baldwin vld1.64 {q6,q7}, [r6, :256]! 891bc3d5698SJohn Baldwin subs r8,r8,#8 892bc3d5698SJohn Baldwin vst1.32 {d26[0]}, [r7, :32]! 
893bc3d5698SJohn Baldwin bne .LNEON_tail 894bc3d5698SJohn Baldwin 895bc3d5698SJohn Baldwin vst1.32 {d10[0]}, [r7, :32] @ top-most bit 896bc3d5698SJohn Baldwin sub r3,r3,r5,lsl#2 @ rewind r3 897bc3d5698SJohn Baldwin subs r1,sp,#0 @ clear carry flag 898bc3d5698SJohn Baldwin add r2,sp,r5,lsl#2 899bc3d5698SJohn Baldwin 900bc3d5698SJohn Baldwin.LNEON_sub: 901bc3d5698SJohn Baldwin ldmia r1!, {r4,r5,r6,r7} 902bc3d5698SJohn Baldwin ldmia r3!, {r8,r9,r10,r11} 903bc3d5698SJohn Baldwin sbcs r8, r4,r8 904bc3d5698SJohn Baldwin sbcs r9, r5,r9 905bc3d5698SJohn Baldwin sbcs r10,r6,r10 906bc3d5698SJohn Baldwin sbcs r11,r7,r11 907bc3d5698SJohn Baldwin teq r1,r2 @ preserves carry 908bc3d5698SJohn Baldwin stmia r0!, {r8,r9,r10,r11} 909bc3d5698SJohn Baldwin bne .LNEON_sub 910bc3d5698SJohn Baldwin 911bc3d5698SJohn Baldwin ldr r10, [r1] @ load top-most bit 912bc3d5698SJohn Baldwin mov r11,sp 913bc3d5698SJohn Baldwin veor q0,q0,q0 914bc3d5698SJohn Baldwin sub r11,r2,r11 @ this is num*4 915bc3d5698SJohn Baldwin veor q1,q1,q1 916bc3d5698SJohn Baldwin mov r1,sp 917bc3d5698SJohn Baldwin sub r0,r0,r11 @ rewind r0 918bc3d5698SJohn Baldwin mov r3,r2 @ second 3/4th of frame 919bc3d5698SJohn Baldwin sbcs r10,r10,#0 @ result is carry flag 920bc3d5698SJohn Baldwin 921bc3d5698SJohn Baldwin.LNEON_copy_n_zap: 922bc3d5698SJohn Baldwin ldmia r1!, {r4,r5,r6,r7} 923bc3d5698SJohn Baldwin ldmia r0, {r8,r9,r10,r11} 924bc3d5698SJohn Baldwin it cc 925bc3d5698SJohn Baldwin movcc r8, r4 926bc3d5698SJohn Baldwin vst1.64 {q0,q1}, [r3,:256]! @ wipe 927bc3d5698SJohn Baldwin itt cc 928bc3d5698SJohn Baldwin movcc r9, r5 929bc3d5698SJohn Baldwin movcc r10,r6 930bc3d5698SJohn Baldwin vst1.64 {q0,q1}, [r3,:256]! 
@ wipe 931bc3d5698SJohn Baldwin it cc 932bc3d5698SJohn Baldwin movcc r11,r7 933bc3d5698SJohn Baldwin ldmia r1, {r4,r5,r6,r7} 934bc3d5698SJohn Baldwin stmia r0!, {r8,r9,r10,r11} 935bc3d5698SJohn Baldwin sub r1,r1,#16 936bc3d5698SJohn Baldwin ldmia r0, {r8,r9,r10,r11} 937bc3d5698SJohn Baldwin it cc 938bc3d5698SJohn Baldwin movcc r8, r4 939bc3d5698SJohn Baldwin vst1.64 {q0,q1}, [r1,:256]! @ wipe 940bc3d5698SJohn Baldwin itt cc 941bc3d5698SJohn Baldwin movcc r9, r5 942bc3d5698SJohn Baldwin movcc r10,r6 943bc3d5698SJohn Baldwin vst1.64 {q0,q1}, [r3,:256]! @ wipe 944bc3d5698SJohn Baldwin it cc 945bc3d5698SJohn Baldwin movcc r11,r7 946bc3d5698SJohn Baldwin teq r1,r2 @ preserves carry 947bc3d5698SJohn Baldwin stmia r0!, {r8,r9,r10,r11} 948bc3d5698SJohn Baldwin bne .LNEON_copy_n_zap 949bc3d5698SJohn Baldwin 950bc3d5698SJohn Baldwin mov sp,ip 951bc3d5698SJohn Baldwin vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} 952bc3d5698SJohn Baldwin ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} 953bc3d5698SJohn Baldwin bx lr @ bx lr 954bc3d5698SJohn Baldwin.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 955bc3d5698SJohn Baldwin#endif 956bc3d5698SJohn Baldwin.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 957bc3d5698SJohn Baldwin.align 2 958bc3d5698SJohn Baldwin.align 2 959bc3d5698SJohn Baldwin#if __ARM_MAX_ARCH__>=7 960bc3d5698SJohn Baldwin.comm OPENSSL_armcap_P,4,4 961bc3d5698SJohn Baldwin#endif 962