#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ChaCha20 for C64x+.
#
# October 2015
#
# Performance is 3.54 cycles per processed byte, which is ~4.3 times
# faster than code generated by TI compiler. Compiler also disables
# interrupts for some reason, thus making interrupt response time
# dependent on input length. This module on the other hand is free
# from such limitation.
#
# Usage: the last command-line argument names the file the generated
# assembly is written to. The script only builds one big string of
# assembly text in $code and prints it at the very end.

# NOTE: "use warnings" is intentionally not enabled; the assembly
# templates below rely on "@X[n]" slice interpolation throughout.
use strict;

# Redirect STDOUT to the requested output file. The open is checked so
# that an unwritable destination aborts the build instead of being
# silently ignored; without an argument STDOUT is left untouched.
my $output = pop;
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}

# Registers holding the function arguments on entry (presumably the
# C calling convention's argument registers - confirm against the ABI).
my ($OUT,$INP,$LEN,$KEYB,$COUNTERA)=("A4","B4","A6","B6","A8");
# Copies of the key/counter pointers for the opposite register file
# (set up with MV in the prologue), and the per-iteration byte step.
my ($KEYA,$COUNTERB,$STEP)=("A7","B7","A3");

# @X is the working state of the (first) block. @Y initially receives
# sigma||key||counter||nonce; it serves as the key-material copy in the
# 1x code path and as the second block's state in the 2x code path.
# @DAT receives input data and holds the xor-ed result to be stored.
my @X=  ("A16","B16","A17","B17","A18","B18","A19","B19",
	 "A20","B20","A21","B21","A22","B22","A23","B23");
my @Y=  ("A24","B24","A25","B25","A26","B26","A27","B27",
	 "A28","B28","A29","B29","A30","B30","A31","B31");
my @DAT=("A6", "A7", "B6", "B7", "A8", "A9", "B8", "B9",
	 "A10","A11","B10","B11","A12","A13","B12","B13");

# yes, overlaps with @DAT, used only in 2x interleave code path...
my @K2x=("A6", "B6", "A7", "B7", "A8", "B8", "A9", "B9",
	 "A10","B10","A11","B11","A2", "B2", "A13","B13");

# Accumulator for the generated assembly text.
my $code = "";

# Prologue: return immediately on zero length, otherwise save the
# registers listed below, load sigma/key/counter/nonce into @Y, copy
# them to @X and pick the 1x path when fewer than 128 bytes are given.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	ChaCha20_ctr32,_ChaCha20_ctr32
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.global	_ChaCha20_ctr32
	.align	32
_ChaCha20_ctr32:
	.asmfunc	stack_usage(40+64)
	MV	$LEN,A0	; reassign
	[!A0]	BNOP	RA	; no data
||	[A0]	STW	FP,*SP--(40+64)	; save frame pointer and alloca(40+64)
||	[A0]	MV	SP,FP
	[A0]	STDW	B13:B12,*SP[4+8]	; ABI says so
||	[A0]	MV	$KEYB,$KEYA
||	[A0]	MV	$COUNTERA,$COUNTERB
	[A0]	STDW	B11:B10,*SP[3+8]
||	[A0]	STDW	A13:A12,*FP[-3]
	[A0]	STDW	A11:A10,*FP[-4]
||	[A0]	MVK	128,$STEP	; 2 * input block size

	[A0]	LDW	*${KEYA}[0],@Y[4]	; load key
||	[A0]	LDW	*${KEYB}[1],@Y[5]
||	[A0]	MVK	0x00007865,@Y[0]	; synthesize sigma
||	[A0]	MVK	0x0000646e,@Y[1]
	[A0]	LDW	*${KEYA}[2],@Y[6]
||	[A0]	LDW	*${KEYB}[3],@Y[7]
||	[A0]	MVKH	0x61700000,@Y[0]
||	[A0]	MVKH	0x33200000,@Y[1]
	LDW	*${KEYA}[4],@Y[8]
||	LDW	*${KEYB}[5],@Y[9]
||	MVK	0x00002d32,@Y[2]
||	MVK	0x00006574,@Y[3]
	LDW	*${KEYA}[6],@Y[10]
||	LDW	*${KEYB}[7],@Y[11]
||	MVKH	0x79620000,@Y[2]
||	MVKH	0x6b200000,@Y[3]
	LDW	*${COUNTERA}[0],@Y[12]	; load counter||nonce
||	LDW	*${COUNTERB}[1],@Y[13]
||	CMPLTU	A0,$STEP,A1	; is length < 2*blocks?
	LDW	*${COUNTERA}[2],@Y[14]
||	LDW	*${COUNTERB}[3],@Y[15]
||	[A1]	BNOP	top1x?
	[A1]	MVK	64,$STEP	; input block size
||	MVK	10,B0	; inner loop counter

	DMV	@Y[2],@Y[0],@X[2]:@X[0]	; copy block
||	DMV	@Y[3],@Y[1],@X[3]:@X[1]
||[!A1]	STDW	@Y[2]:@Y[0],*FP[-12]	; offload key material to stack
||[!A1]	STDW	@Y[3]:@Y[1],*SP[2]
	DMV	@Y[6],@Y[4],@X[6]:@X[4]
||	DMV	@Y[7],@Y[5],@X[7]:@X[5]
||[!A1]	STDW	@Y[6]:@Y[4],*FP[-10]
||[!A1]	STDW	@Y[7]:@Y[5],*SP[4]
	DMV	@Y[10],@Y[8],@X[10]:@X[8]
||	DMV	@Y[11],@Y[9],@X[11]:@X[9]
||[!A1]	STDW	@Y[10]:@Y[8],*FP[-8]
||[!A1]	STDW	@Y[11]:@Y[9],*SP[6]
	DMV	@Y[14],@Y[12],@X[14]:@X[12]
||	DMV	@Y[15],@Y[13],@X[15]:@X[13]
||[!A1]	MV	@Y[12],@K2x[12]	; counter
||[!A1]	MV	@Y[13],@K2x[13]
||[!A1]	STW	@Y[14],*FP[-6*2]
||[!A1]	STW	@Y[15],*SP[8*2]
___
{	################################################################
	# 2x interleave gives 50% performance improvement
	#
	# Indices of the a/b/c/d rows of the 4x4 state inside @X and @Y.
my ($a0,$a1,$a2,$a3) = (0..3);
my ($b0,$b1,$b2,$b3) = (4..7);
my ($c0,$c1,$c2,$c3) = (8..11);
my ($d0,$d1,$d2,$d3) = (12..15);

$code.=<<___;
outer2x?:
	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	DMV	@Y[2],@Y[0],@K2x[2]:@K2x[0]
||	DMV	@Y[3],@Y[1],@K2x[3]:@K2x[1]
	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	DMV	@Y[6],@Y[4],@K2x[6]:@K2x[4]
||	DMV	@Y[7],@Y[5],@K2x[7]:@K2x[5]
	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
||	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d0],@X[$d0]
||	SWAP2	@X[$d3],@X[$d3]

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
||	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	DMV	@Y[10],@Y[8],@K2x[10]:@K2x[8]
||	DMV	@Y[11],@Y[9],@K2x[11]:@K2x[9]
	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ADD	1,@Y[12],@Y[12]	; adjust counter for 2nd block
	ROTL	@X[$b1],12,@X[$b1]
||	ROTL	@X[$b2],12,@X[$b2]
||	MV	@Y[14],@K2x[14]
||	MV	@Y[15],@K2x[15]
top2x?:
	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b3],12,@X[$b3]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]

||	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
||	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d1],8,@X[$d1]
||	ROTL	@X[$d2],8,@X[$d2]
||	SWAP2	@Y[$d1],@Y[$d1]	; rotate by 16
||	SWAP2	@Y[$d2],@Y[$d2]
||	SWAP2	@Y[$d0],@Y[$d0]
||	SWAP2	@Y[$d3],@Y[$d3]
	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d3],8,@X[$d3]
||	ADD	@Y[$d1],@Y[$c1],@Y[$c1]
||	ADD	@Y[$d2],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d0],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c3],@Y[$c3]
||	BNOP	middle2x1?	; protect from interrupt

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
||	XOR	@Y[$c1],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c2],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c0],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b3],@Y[$b3]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	ROTL	@X[$d1],0,@X[$d2]	; moved to avoid cross-path stall
||	ROTL	@X[$d2],0,@X[$d3]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	MV	@X[$d0],@X[$d1]
||	MV	@X[$d3],@X[$d0]
||	ROTL	@Y[$b1],12,@Y[$b1]
||	ROTL	@Y[$b2],12,@Y[$b2]
	ROTL	@X[$b1],7,@X[$b0]	; avoided cross-path stall
||	ROTL	@X[$b2],7,@X[$b1]
	ROTL	@X[$b0],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b2]
middle2x1?:

	ROTL	@Y[$b0],12,@Y[$b0]
||	ROTL	@Y[$b3],12,@Y[$b3]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]

||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
||	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ROTL	@Y[$d1],8,@Y[$d1]
||	ROTL	@Y[$d2],8,@Y[$d2]
||	SWAP2	@X[$d0],@X[$d0]	; rotate by 16
||	SWAP2	@X[$d1],@X[$d1]
||	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d3],@X[$d3]
	ROTL	@Y[$d0],8,@Y[$d0]
||	ROTL	@Y[$d3],8,@Y[$d3]
||	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
||	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	BNOP	middle2x2?	; protect from interrupt

	ADD	@Y[$d1],@Y[$c1],@Y[$c1]
||	ADD	@Y[$d2],@Y[$c2],@Y[$c2]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
	ADD	@Y[$d0],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c3],@Y[$c3]
||	XOR	@Y[$c1],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c2],@Y[$b2],@Y[$b2]
||	ROTL	@Y[$d1],0,@Y[$d2]	; moved to avoid cross-path stall
||	ROTL	@Y[$d2],0,@Y[$d3]
	XOR	@Y[$c0],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b3],@Y[$b3]
||	MV	@Y[$d0],@Y[$d1]
||	MV	@Y[$d3],@Y[$d0]
||	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b1],12,@X[$b1]
	ROTL	@Y[$b1],7,@Y[$b0]	; avoided cross-path stall
||	ROTL	@Y[$b2],7,@Y[$b1]
	ROTL	@Y[$b0],7,@Y[$b3]
||	ROTL	@Y[$b3],7,@Y[$b2]
middle2x2?:

	ROTL	@X[$b2],12,@X[$b2]
||	ROTL	@X[$b3],12,@X[$b3]
||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]

||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
||	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d1],8,@X[$d1]
||	SWAP2	@Y[$d0],@Y[$d0]	; rotate by 16
||	SWAP2	@Y[$d1],@Y[$d1]
||	SWAP2	@Y[$d2],@Y[$d2]
||	SWAP2	@Y[$d3],@Y[$d3]
	ROTL	@X[$d2],8,@X[$d2]
||	ROTL	@X[$d3],8,@X[$d3]
||	ADD	@Y[$d0],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d1],@Y[$c3],@Y[$c3]
||	ADD	@Y[$d2],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c1],@Y[$c1]
||	BNOP	bottom2x1?	; protect from interrupt

	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
||	XOR	@Y[$c2],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c0],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c1],@Y[$b3],@Y[$b3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	ROTL	@X[$d0],0,@X[$d3]	; moved to avoid cross-path stall
||	ROTL	@X[$d1],0,@X[$d0]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	MV	@X[$d2],@X[$d1]
||	MV	@X[$d3],@X[$d2]
||	ROTL	@Y[$b0],12,@Y[$b0]
||	ROTL	@Y[$b1],12,@Y[$b1]
	ROTL	@X[$b0],7,@X[$b1]	; avoided cross-path stall
||	ROTL	@X[$b1],7,@X[$b2]
	ROTL	@X[$b2],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b0]
||	[B0]	SUB	B0,1,B0	; decrement inner loop counter
bottom2x1?:

	ROTL	@Y[$b2],12,@Y[$b2]
||	ROTL	@Y[$b3],12,@Y[$b3]
||	[B0]	ADD	@X[$b1],@X[$a1],@X[$a1]	; modulo-scheduled
||	[B0]	ADD	@X[$b2],@X[$a2],@X[$a2]
	[B0]	ADD	@X[$b0],@X[$a0],@X[$a0]
||	[B0]	ADD	@X[$b3],@X[$a3],@X[$a3]

||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
||	[B0]	XOR	@X[$a1],@X[$d1],@X[$d1]
||	[B0]	XOR	@X[$a2],@X[$d2],@X[$d2]
	[B0]	XOR	@X[$a0],@X[$d0],@X[$d0]
||	[B0]	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]
||	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ROTL	@Y[$d0],8,@Y[$d0]
||	ROTL	@Y[$d1],8,@Y[$d1]
||	[B0]	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
||	[B0]	SWAP2	@X[$d2],@X[$d2]
||	[B0]	SWAP2	@X[$d0],@X[$d0]
||	[B0]	SWAP2	@X[$d3],@X[$d3]
	ROTL	@Y[$d2],8,@Y[$d2]
||	ROTL	@Y[$d3],8,@Y[$d3]
||	[B0]	ADD	@X[$d1],@X[$c1],@X[$c1]
||	[B0]	ADD	@X[$d2],@X[$c2],@X[$c2]
||	[B0]	ADD	@X[$d0],@X[$c0],@X[$c0]
||	[B0]	ADD	@X[$d3],@X[$c3],@X[$c3]
||	[B0]	BNOP	top2x?	; even protects from interrupt

	ADD	@Y[$d0],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d1],@Y[$c3],@Y[$c3]
||	[B0]	XOR	@X[$c1],@X[$b1],@X[$b1]
||	[B0]	XOR	@X[$c2],@X[$b2],@X[$b2]
||	[B0]	XOR	@X[$c0],@X[$b0],@X[$b0]
||	[B0]	XOR	@X[$c3],@X[$b3],@X[$b3]
	ADD	@Y[$d2],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c1],@Y[$c1]
||	XOR	@Y[$c2],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b1],@Y[$b1]
||	ROTL	@Y[$d0],0,@Y[$d3]	; moved to avoid cross-path stall
||	ROTL	@Y[$d1],0,@Y[$d0]
	XOR	@Y[$c0],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c1],@Y[$b3],@Y[$b3]
||	MV	@Y[$d2],@Y[$d1]
||	MV	@Y[$d3],@Y[$d2]
||	[B0]	ROTL	@X[$b1],12,@X[$b1]
||	[B0]	ROTL	@X[$b2],12,@X[$b2]
	ROTL	@Y[$b0],7,@Y[$b1]	; avoided cross-path stall
||	ROTL	@Y[$b1],7,@Y[$b2]
	ROTL	@Y[$b2],7,@Y[$b3]
||	ROTL	@Y[$b3],7,@Y[$b0]
bottom2x2?:
___
}

# 2x path finalisation: add the key material (@K2x) to both blocks,
# byte-swap on big-endian targets, xor with 128 bytes of input, store
# the result, then either loop back to outer2x?, fall through to the
# 1x path, or leave through epilogue? when no data remains.
$code.=<<___;
	ADD	@K2x[0],@X[0],@X[0]	; accumulate key material
||	ADD	@K2x[1],@X[1],@X[1]
||	ADD	@K2x[2],@X[2],@X[2]
||	ADD	@K2x[3],@X[3],@X[3]
	ADD	@K2x[0],@Y[0],@Y[0]
||	ADD	@K2x[1],@Y[1],@Y[1]
||	ADD	@K2x[2],@Y[2],@Y[2]
||	ADD	@K2x[3],@Y[3],@Y[3]
||	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
	ADD	@K2x[4],@X[4],@X[4]
||	ADD	@K2x[5],@X[5],@X[5]
||	ADD	@K2x[6],@X[6],@X[6]
||	ADD	@K2x[7],@X[7],@X[7]
||	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	ADD	@K2x[4],@Y[4],@Y[4]
||	ADD	@K2x[5],@Y[5],@Y[5]
||	ADD	@K2x[6],@Y[6],@Y[6]
||	ADD	@K2x[7],@Y[7],@Y[7]
||	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	ADD	@K2x[8],@X[8],@X[8]
||	ADD	@K2x[9],@X[9],@X[9]
||	ADD	@K2x[10],@X[10],@X[10]
||	ADD	@K2x[11],@X[11],@X[11]
||	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
	ADD	@K2x[8],@Y[8],@Y[8]
||	ADD	@K2x[9],@Y[9],@Y[9]
||	ADD	@K2x[10],@Y[10],@Y[10]
||	ADD	@K2x[11],@Y[11],@Y[11]
||	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
	ADD	@K2x[12],@X[12],@X[12]
||	ADD	@K2x[13],@X[13],@X[13]
||	ADD	@K2x[14],@X[14],@X[14]
||	ADD	@K2x[15],@X[15],@X[15]
||	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	ADD	@K2x[12],@Y[12],@Y[12]
||	ADD	@K2x[13],@Y[13],@Y[13]
||	ADD	@K2x[14],@Y[14],@Y[14]
||	ADD	@K2x[15],@Y[15],@Y[15]
||	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
	ADD	1,@Y[12],@Y[12]	; adjust counter for 2nd block
||	ADD	2,@K2x[12],@K2x[12]	; increment counter
||	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	.if	.BIG_ENDIAN
	SWAP2	@X[0],@X[0]
||	SWAP2	@X[1],@X[1]
||	SWAP2	@X[2],@X[2]
||	SWAP2	@X[3],@X[3]
	SWAP2	@X[4],@X[4]
||	SWAP2	@X[5],@X[5]
||	SWAP2	@X[6],@X[6]
||	SWAP2	@X[7],@X[7]
	SWAP2	@X[8],@X[8]
||	SWAP2	@X[9],@X[9]
||	SWAP4	@X[0],@X[1]
||	SWAP4	@X[1],@X[0]
	SWAP2	@X[10],@X[10]
||	SWAP2	@X[11],@X[11]
||	SWAP4	@X[2],@X[3]
||	SWAP4	@X[3],@X[2]
	SWAP2	@X[12],@X[12]
||	SWAP2	@X[13],@X[13]
||	SWAP4	@X[4],@X[5]
||	SWAP4	@X[5],@X[4]
	SWAP2	@X[14],@X[14]
||	SWAP2	@X[15],@X[15]
||	SWAP4	@X[6],@X[7]
||	SWAP4	@X[7],@X[6]
	SWAP4	@X[8],@X[9]
||	SWAP4	@X[9],@X[8]
||	SWAP2	@Y[0],@Y[0]
||	SWAP2	@Y[1],@Y[1]
	SWAP4	@X[10],@X[11]
||	SWAP4	@X[11],@X[10]
||	SWAP2	@Y[2],@Y[2]
||	SWAP2	@Y[3],@Y[3]
	SWAP4	@X[12],@X[13]
||	SWAP4	@X[13],@X[12]
||	SWAP2	@Y[4],@Y[4]
||	SWAP2	@Y[5],@Y[5]
	SWAP4	@X[14],@X[15]
||	SWAP4	@X[15],@X[14]
||	SWAP2	@Y[6],@Y[6]
||	SWAP2	@Y[7],@Y[7]
	SWAP2	@Y[8],@Y[8]
||	SWAP2	@Y[9],@Y[9]
||	SWAP4	@Y[0],@Y[1]
||	SWAP4	@Y[1],@Y[0]
	SWAP2	@Y[10],@Y[10]
||	SWAP2	@Y[11],@Y[11]
||	SWAP4	@Y[2],@Y[3]
||	SWAP4	@Y[3],@Y[2]
	SWAP2	@Y[12],@Y[12]
||	SWAP2	@Y[13],@Y[13]
||	SWAP4	@Y[4],@Y[5]
||	SWAP4	@Y[5],@Y[4]
	SWAP2	@Y[14],@Y[14]
||	SWAP2	@Y[15],@Y[15]
||	SWAP4	@Y[6],@Y[7]
||	SWAP4	@Y[7],@Y[6]
	SWAP4	@Y[8],@Y[9]
||	SWAP4	@Y[9],@Y[8]
	SWAP4	@Y[10],@Y[11]
||	SWAP4	@Y[11],@Y[10]
	SWAP4	@Y[12],@Y[13]
||	SWAP4	@Y[13],@Y[12]
	SWAP4	@Y[14],@Y[15]
||	SWAP4	@Y[15],@Y[14]
	.endif

	XOR	@DAT[0],@X[0],@X[0]	; xor 1st block
||	XOR	@DAT[3],@X[3],@X[3]
||	XOR	@DAT[2],@X[2],@X[1]
||	XOR	@DAT[1],@X[1],@X[2]
||	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
	XOR	@DAT[4],@X[4],@X[4]
||	XOR	@DAT[7],@X[7],@X[7]
||	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	XOR	@DAT[6],@X[6],@X[5]
||	XOR	@DAT[5],@X[5],@X[6]
||	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	XOR	@DAT[8],@X[8],@X[8]
||	XOR	@DAT[11],@X[11],@X[11]
||	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
	XOR	@DAT[10],@X[10],@X[9]
||	XOR	@DAT[9],@X[9],@X[10]
||	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
	XOR	@DAT[12],@X[12],@X[12]
||	XOR	@DAT[15],@X[15],@X[15]
||	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	XOR	@DAT[14],@X[14],@X[13]
||	XOR	@DAT[13],@X[13],@X[14]
||	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
	[A0]	SUB	A0,$STEP,A0	; SUB A0,128,A0
||	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	XOR	@Y[0],@DAT[0],@DAT[0]	; xor 2nd block
||	XOR	@Y[1],@DAT[1],@DAT[1]
||	STNDW	@X[2]:@X[0],*${OUT}++[8]
	XOR	@Y[2],@DAT[2],@DAT[2]
||	XOR	@Y[3],@DAT[3],@DAT[3]
||	STNDW	@X[3]:@X[1],*${OUT}[-7]
	XOR	@Y[4],@DAT[4],@DAT[4]
||	[A0]	LDDW	*FP[-12],@X[2]:@X[0]	; re-load key material from stack
||	[A0]	LDDW	*SP[2], @X[3]:@X[1]
	XOR	@Y[5],@DAT[5],@DAT[5]
||	STNDW	@X[6]:@X[4],*${OUT}[-6]
	XOR	@Y[6],@DAT[6],@DAT[6]
||	XOR	@Y[7],@DAT[7],@DAT[7]
||	STNDW	@X[7]:@X[5],*${OUT}[-5]
	XOR	@Y[8],@DAT[8],@DAT[8]
||	[A0]	LDDW	*FP[-10],@X[6]:@X[4]
||	[A0]	LDDW	*SP[4], @X[7]:@X[5]
	XOR	@Y[9],@DAT[9],@DAT[9]
||	STNDW	@X[10]:@X[8],*${OUT}[-4]
	XOR	@Y[10],@DAT[10],@DAT[10]
||	XOR	@Y[11],@DAT[11],@DAT[11]
||	STNDW	@X[11]:@X[9],*${OUT}[-3]
	XOR	@Y[12],@DAT[12],@DAT[12]
||	[A0]	LDDW	*FP[-8], @X[10]:@X[8]
||	[A0]	LDDW	*SP[6], @X[11]:@X[9]
	XOR	@Y[13],@DAT[13],@DAT[13]
||	STNDW	@X[14]:@X[12],*${OUT}[-2]
	XOR	@Y[14],@DAT[14],@DAT[14]
||	XOR	@Y[15],@DAT[15],@DAT[15]
||	STNDW	@X[15]:@X[13],*${OUT}[-1]

	[A0]	MV	@K2x[12],@X[12]
||	[A0]	MV	@K2x[13],@X[13]
||	[A0]	LDW	*FP[-6*2], @X[14]
||	[A0]	LDW	*SP[8*2], @X[15]

	[A0]	DMV	@X[2],@X[0],@Y[2]:@Y[0]	; duplicate key material
||	STNDW	@DAT[1]:@DAT[0],*${OUT}++[8]
	[A0]	DMV	@X[3],@X[1],@Y[3]:@Y[1]
||	STNDW	@DAT[3]:@DAT[2],*${OUT}[-7]
	[A0]	DMV	@X[6],@X[4],@Y[6]:@Y[4]
||	STNDW	@DAT[5]:@DAT[4],*${OUT}[-6]
||	CMPLTU	A0,$STEP,A1	; is remaining length < 2*blocks?
||[!A0]	BNOP	epilogue?
	[A0]	DMV	@X[7],@X[5],@Y[7]:@Y[5]
||	STNDW	@DAT[7]:@DAT[6],*${OUT}[-5]
||[!A1]	BNOP	outer2x?
	[A0]	DMV	@X[10],@X[8],@Y[10]:@Y[8]
||	STNDW	@DAT[9]:@DAT[8],*${OUT}[-4]
	[A0]	DMV	@X[11],@X[9],@Y[11]:@Y[9]
||	STNDW	@DAT[11]:@DAT[10],*${OUT}[-3]
	[A0]	DMV	@X[14],@X[12],@Y[14]:@Y[12]
||	STNDW	@DAT[13]:@DAT[12],*${OUT}[-2]
	[A0]	DMV	@X[15],@X[13],@Y[15]:@Y[13]
||	STNDW	@DAT[15]:@DAT[14],*${OUT}[-1]
;;===== branch to epilogue? is taken here
	[A1]	MVK	64,$STEP
||	[A0]	MVK	10,B0	; inner loop counter
;;===== branch to outer2x? is taken here
___
{
	# 1x code path: one block at a time, @X is the working state and
	# @Y keeps the key material. Same row indices as in the 2x block.
my ($a0,$a1,$a2,$a3) = (0..3);
my ($b0,$b1,$b2,$b3) = (4..7);
my ($c0,$c1,$c2,$c3) = (8..11);
my ($d0,$d1,$d2,$d3) = (12..15);

$code.=<<___;
top1x?:
	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
||	SWAP2	@X[$d2],@X[$d2]
	SWAP2	@X[$d0],@X[$d0]
||	SWAP2	@X[$d3],@X[$d3]

||	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ROTL	@X[$b1],12,@X[$b1]
||	ROTL	@X[$b2],12,@X[$b2]
	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b3],12,@X[$b3]

	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d1],8,@X[$d1]
||	ROTL	@X[$d2],8,@X[$d2]
	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d3],8,@X[$d3]
||	BNOP	middle1x?	; protect from interrupt

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	ROTL	@X[$d1],0,@X[$d2]	; moved to avoid cross-path stall
||	ROTL	@X[$d2],0,@X[$d3]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ROTL	@X[$d0],0,@X[$d1]
||	ROTL	@X[$d3],0,@X[$d0]
	ROTL	@X[$b1],7,@X[$b0]	; avoided cross-path stall
||	ROTL	@X[$b2],7,@X[$b1]
	ROTL	@X[$b0],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b2]
middle1x?:

	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	SWAP2	@X[$d0],@X[$d0]	; rotate by 16
||	SWAP2	@X[$d1],@X[$d1]
	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d3],@X[$d3]

||	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b1],12,@X[$b1]
	ROTL	@X[$b2],12,@X[$b2]
||	ROTL	@X[$b3],12,@X[$b3]

	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
||	[B0]	SUB	B0,1,B0	; decrement inner loop counter
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d1],8,@X[$d1]
	ROTL	@X[$d2],8,@X[$d2]
||	ROTL	@X[$d3],8,@X[$d3]
||	[B0]	BNOP	top1x?	; even protects from interrupt

	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	ROTL	@X[$d0],0,@X[$d3]	; moved to avoid cross-path stall
||	ROTL	@X[$d1],0,@X[$d0]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	ROTL	@X[$d2],0,@X[$d1]
||	ROTL	@X[$d3],0,@X[$d2]
	ROTL	@X[$b0],7,@X[$b1]	; avoided cross-path stall
||	ROTL	@X[$b1],7,@X[$b2]
	ROTL	@X[$b2],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b0]
||[!B0]	CMPLTU	A0,$STEP,A1	; less than 64 bytes left?
bottom1x?:
___
}

# 1x path finalisation: add the key material, byte-swap on big-endian
# targets, xor one 64-byte block with the input and store it; branch to
# tail? when fewer than 64 bytes are left. Followed by the epilogue
# (register restore and return) and the start of the byte-by-byte tail.
$code.=<<___;
	ADD	@Y[0],@X[0],@X[0]	; accumulate key material
||	ADD	@Y[1],@X[1],@X[1]
||	ADD	@Y[2],@X[2],@X[2]
||	ADD	@Y[3],@X[3],@X[3]
||[!A1]	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
||	[A1]	BNOP	tail?
	ADD	@Y[4],@X[4],@X[4]
||	ADD	@Y[5],@X[5],@X[5]
||	ADD	@Y[6],@X[6],@X[6]
||	ADD	@Y[7],@X[7],@X[7]
||[!A1]	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	ADD	@Y[8],@X[8],@X[8]
||	ADD	@Y[9],@X[9],@X[9]
||	ADD	@Y[10],@X[10],@X[10]
||	ADD	@Y[11],@X[11],@X[11]
||[!A1]	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	ADD	@Y[12],@X[12],@X[12]
||	ADD	@Y[13],@X[13],@X[13]
||	ADD	@Y[14],@X[14],@X[14]
||	ADD	@Y[15],@X[15],@X[15]
||[!A1]	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
	[!A1]	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
	[!A1]	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	.if	.BIG_ENDIAN
	SWAP2	@X[0],@X[0]
||	SWAP2	@X[1],@X[1]
||	SWAP2	@X[2],@X[2]
||	SWAP2	@X[3],@X[3]
	SWAP2	@X[4],@X[4]
||	SWAP2	@X[5],@X[5]
||	SWAP2	@X[6],@X[6]
||	SWAP2	@X[7],@X[7]
	SWAP2	@X[8],@X[8]
||	SWAP2	@X[9],@X[9]
||	SWAP4	@X[0],@X[1]
||	SWAP4	@X[1],@X[0]
	SWAP2	@X[10],@X[10]
||	SWAP2	@X[11],@X[11]
||	SWAP4	@X[2],@X[3]
||	SWAP4	@X[3],@X[2]
	SWAP2	@X[12],@X[12]
||	SWAP2	@X[13],@X[13]
||	SWAP4	@X[4],@X[5]
||	SWAP4	@X[5],@X[4]
	SWAP2	@X[14],@X[14]
||	SWAP2	@X[15],@X[15]
||	SWAP4	@X[6],@X[7]
||	SWAP4	@X[7],@X[6]
	SWAP4	@X[8],@X[9]
||	SWAP4	@X[9],@X[8]
	SWAP4	@X[10],@X[11]
||	SWAP4	@X[11],@X[10]
	SWAP4	@X[12],@X[13]
||	SWAP4	@X[13],@X[12]
	SWAP4	@X[14],@X[15]
||	SWAP4	@X[15],@X[14]
	.else
	NOP	1
	.endif

	XOR	@X[0],@DAT[0],@DAT[0]	; xor with input
||	XOR	@X[1],@DAT[1],@DAT[1]
||	XOR	@X[2],@DAT[2],@DAT[2]
||	XOR	@X[3],@DAT[3],@DAT[3]
||	[A0]	SUB	A0,$STEP,A0	; SUB A0,64,A0
	XOR	@X[4],@DAT[4],@DAT[4]
||	XOR	@X[5],@DAT[5],@DAT[5]
||	XOR	@X[6],@DAT[6],@DAT[6]
||	XOR	@X[7],@DAT[7],@DAT[7]
||	STNDW	@DAT[1]:@DAT[0],*${OUT}++[8]
	XOR	@X[8],@DAT[8],@DAT[8]
||	XOR	@X[9],@DAT[9],@DAT[9]
||	XOR	@X[10],@DAT[10],@DAT[10]
||	XOR	@X[11],@DAT[11],@DAT[11]
||	STNDW	@DAT[3]:@DAT[2],*${OUT}[-7]
	XOR	@X[12],@DAT[12],@DAT[12]
||	XOR	@X[13],@DAT[13],@DAT[13]
||	XOR	@X[14],@DAT[14],@DAT[14]
||	XOR	@X[15],@DAT[15],@DAT[15]
||	STNDW	@DAT[5]:@DAT[4],*${OUT}[-6]
||	[A0]	BNOP	top1x?
	[A0]	DMV	@Y[2],@Y[0],@X[2]:@X[0]	; duplicate key material
||	[A0]	DMV	@Y[3],@Y[1],@X[3]:@X[1]
||	STNDW	@DAT[7]:@DAT[6],*${OUT}[-5]
	[A0]	DMV	@Y[6],@Y[4],@X[6]:@X[4]
||	[A0]	DMV	@Y[7],@Y[5],@X[7]:@X[5]
||	STNDW	@DAT[9]:@DAT[8],*${OUT}[-4]
	[A0]	DMV	@Y[10],@Y[8],@X[10]:@X[8]
||	[A0]	DMV	@Y[11],@Y[9],@X[11]:@X[9]
||	[A0]	ADD	1,@Y[12],@Y[12]	; increment counter
||	STNDW	@DAT[11]:@DAT[10],*${OUT}[-3]
	[A0]	DMV	@Y[14],@Y[12],@X[14]:@X[12]
||	[A0]	DMV	@Y[15],@Y[13],@X[15]:@X[13]
||	STNDW	@DAT[13]:@DAT[12],*${OUT}[-2]
	[A0]	MVK	10,B0	; inner loop counter
||	STNDW	@DAT[15]:@DAT[14],*${OUT}[-1]
;;===== branch to top1x? is taken here

epilogue?:
	LDDW	*FP[-4],A11:A10	; ABI says so
	LDDW	*FP[-3],A13:A12
||	LDDW	*SP[3+8],B11:B10
	LDDW	*SP[4+8],B13:B12
||	BNOP	RA
	LDW	*++SP(40+64),FP	; restore frame pointer
	NOP	4

tail?:
	LDBU	*${INP}++[1],B24	; load byte by byte
||	SUB	A0,1,A0
||	SUB	A0,1,B1
	[!B1]	BNOP	epilogue?	; interrupts are disabled for whole time
||	[A0]	LDBU	*${INP}++[1],B24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
	[!B1]	BNOP	epilogue?
||	[A0]	LDBU	*${INP}++[1],B24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
	[!B1]	BNOP	epilogue?
||	ROTL	@X[0],0,A24
||	[A0]	LDBU	*${INP}++[1],B24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
	[!B1]	BNOP	epilogue?
||	ROTL	@X[0],24,A24
||	[A0]	LDBU	*${INP}++[1],A24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
	[!B1]	BNOP	epilogue?
||	ROTL	@X[0],16,A24
||	[A0]	LDBU	*${INP}++[1],A24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]	; store byte by byte
||[!B1]	BNOP	epilogue?
||	ROTL	@X[0],8,A24
||	[A0]	LDBU	*${INP}++[1],A24
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]
___

# TAIL_STEP(REG) - append the four byte-wide tail steps for one key
# stream word to $code.
#
#   REG  name of the register holding the key stream word.
#
# The rotated word goes to register 24 of REG's own register file
# (B24 for a B register, A24 otherwise); the first load targets
# register 24 of the other file and the xor result goes to register 25
# of that other file. The emitted text starts with "||", i.e. it
# continues the execute packet that ended the previously appended text.
sub TAIL_STEP {
my ($xi) = @_;
my $t_reg = ($xi=~/^B/?"B24":"A24");	# match @X[i] to avoid cross path
my $d_reg = $t_reg;  $d_reg=~tr/AB/BA/;	# same number, other register file
my $o_reg = $d_reg;  $o_reg=~s/24/25/;	# xor result register

$code.=<<___;
||[!B1]	BNOP	epilogue?
||	ROTL	$xi,0,$t_reg
||	[A0]	LDBU	*${INP}++[1],$d_reg
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$o_reg
	STB	$o_reg,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$xi,24,$t_reg
||	[A0]	LDBU	*${INP}++[1],$t_reg
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$o_reg
	STB	$o_reg,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$xi,16,$t_reg
||	[A0]	LDBU	*${INP}++[1],$t_reg
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$o_reg
	STB	$o_reg,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$xi,8,$t_reg
||	[A0]	LDBU	*${INP}++[1],$t_reg
||	[A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$o_reg
	STB	$o_reg,*${OUT}++[1]
___
}

# Word 0 is handled inline above and word 15 in the closing template
# below; generate the steps for the words in between.
for my $i (1..14) { TAIL_STEP($X[$i]); }

$code.=<<___;
||[!B1]	BNOP	epilogue?
||	ROTL	@X[15],0,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	ROTL	@X[15],24,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	ROTL	@X[15],16,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]
	.endasmfunc

	.sect	.const
	.cstring "ChaCha20 for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Emit the assembled text; buffered write errors surface at close.
print $code;
close STDOUT or die "error closing STDOUT: $!";