;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the OpenSSL license (the "License"). You may not use
;; this file except in compliance with the License. You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;
;; NOTE(review): this is TI C64x+ assembly. Loops are software-
;; pipelined with SPLOOP/SPKERNEL; NOP counts, register-file choice
;; (A vs B side) and the '||' parallel bars are all cycle-accurate
;; scheduling and must not be rearranged casually.
;;====================================================================
	.text

	;; Pre-EABI assemblers do not predefine __TI_EABI__; default it
	;; to 0 so the .if below is always well-defined.
	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	;; Under EABI the exported names carry no leading underscore;
	;; alias the underscored local labels to the bare EABI names.
	.if	__TI_EABI__
	.asg	bn_mul_add_words,_bn_mul_add_words
	.asg	bn_mul_words,_bn_mul_words
	.asg	bn_sqr_words,_bn_sqr_words
	.asg	bn_add_words,_bn_add_words
	.asg	bn_sub_words,_bn_sub_words
	.asg	bn_div_words,_bn_div_words
	.asg	bn_sqr_comba8,_bn_sqr_comba8
	.asg	bn_mul_comba8,_bn_mul_comba8
	.asg	bn_sqr_comba4,_bn_sqr_comba4
	.asg	bn_mul_comba4,_bn_mul_comba4
	.endif

	;; Symbolic names for the C6000 calling convention:
	;; B3 = return address, A4/B4/A6/B6/A8/B8 = first six args,
	;; A4 also carries the return value.
	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

;;--------------------------------------------------------------------
;; bn_mul_add_words(rp=ARG0, ap=ARG1, num=ARG2, w=ARG3)
;; rp[i] += ap[i]*w for i < num, carry propagated through A19;
;; returns the final carry (high word) in RET.
;; num==0 returns 0 immediately.
;;--------------------------------------------------------------------
	.global	_bn_mul_add_words
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num (also predicate)
  [!B0]	BNOP	RA			; num==0: return...
||[!B0]	MVK	0,RET			; ...0
   [B0]	MVC	B0,ILC			; inner loop count = num
   [B0]	ZERO	A19			; high part of accumulator
|| [B0]	MV	ARG0,A2			; A2 = store copy of rp
|| [B0]	MV	ARG3,A3			; A3 = w
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7		; ap[i]
	NOP	3
	LDW	*ARG0++,A7		; rp[i]
	MPY32U	B7,A3,A17:A16		; ap[i]*w (64-bit)
	NOP	3			; [2,0] in epilogue
	ADDU	A16,A7,A21:A20		; lo += rp[i]
	ADDU	A19,A21:A20,A19:A18	; += carry from previous word
||	MV.S	A17,A23
	SPKERNEL 2,1			; leave slot for "return value"
||	STW	A18,*A2++		; rp[i]
||	ADD	A19,A23,A19		; carry = hi + carries
;;====================================================================
	BNOP	RA,4
	MV	A19,RET			; return value
	.endasmfunc

;;--------------------------------------------------------------------
;; bn_mul_words(rp=ARG0, ap=ARG1, num=ARG2, w=ARG3)
;; rp[i] = ap[i]*w + carry for i < num; returns final carry.
;; Same kernel as bn_mul_add_words minus the rp[i] load/add.
;;--------------------------------------------------------------------
	.global	_bn_mul_words
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num (also predicate)
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19			; high part of accumulator
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7		; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16		; ap[i]*w (64-bit)
	NOP	4			; [2,0] in epilogue
	ADDU	A19,A16,A19:A18		; lo += carry
||	MV.S	A17,A21
	SPKERNEL 2,1			; leave slot for "return value"
||	STW	A18,*ARG0++		; rp[i]
||	ADD.L	A19,A21,A19		; carry = hi + carry-out
;;====================================================================
	BNOP	RA,4
	MV	A19,RET			; return value
	.endasmfunc

;;--------------------------------------------------------------------
;; bn_sqr_words(rp=ARG0, ap=ARG1, num=ARG2)
;; rp[2*i] / rp[2*i+1] = low/high word of ap[i]^2, for i < num.
;; Two pointers (B2 even words, ARG0 odd words) stride by 8 bytes.
;;--------------------------------------------------------------------
	.global	_bn_sqr_words
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num (also predicate)
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	MV	ARG0,B2			; B2 -> rp[0], even-index words
|| [B0]	ADD	4,ARG0,ARG0		; ARG0 -> rp[1], odd-index words
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0		; ap[i]^2 (64-bit)
	NOP	3			; [2,0] in epilogue
	STW	B0,*B2++(8)		; rp[2*i]
	MV	B1,A1
	SPKERNEL 2,0			; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)		; rp[2*i+1]
;;====================================================================
	BNOP	RA,5
	.endasmfunc

;;--------------------------------------------------------------------
;; bn_add_words(rp=ARG0, ap=ARG1, bp=ARG2, num=ARG3)
;; rp[i] = ap[i] + bp[i] + carry for i < num; returns final carry
;; (0 or 1, kept live in RET each iteration).
;;--------------------------------------------------------------------
	.global	_bn_add_words
_bn_add_words:
	.asmfunc
	MV	ARG3,B0			; B0 = num (also predicate)
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A1			; carry flag
|| [B0]	MV	ARG0,A3			; A3 = store copy of rp
	NOP	3

	SPLOOP	2			; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7		; bp[i]
||	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8		; ap[i]+bp[i]
	ADDU	A1,A9:A8,A1:A0		; + carry-in; A1 = carry-out
	SPKERNEL 0,0			; fully overlap BNOP RA,5
||	STW	A0,*A3++		; write result
||	MV	A1,RET			; keep carry flag in RET
;;====================================================================
	BNOP	RA,5
	.endasmfunc

;;--------------------------------------------------------------------
;; bn_sub_words(rp=ARG0, ap=ARG1, bp=ARG2, num=ARG3)
;; rp[i] = ap[i] - bp[i] - borrow for i < num; returns final borrow
;; (0 or 1).  Borrow is bit 0 of the SUBU 40-bit high part, inverted
;; sense: A1==1 means no borrow, hence the AND 1 + conditional SUB.
;;--------------------------------------------------------------------
	.global	_bn_sub_words
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0			; B0 = num (also predicate)
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	ZERO	A2			; borrow flag
|| [B0]	MV	ARG0,A3			; A3 = store copy of rp
   [B0]	MVC	B0,ILC
	NOP	3

	SPLOOP	2			; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7		; bp[i]
||	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0		; ap[i]-bp[i]
   [A2]	SUB	A1:A0,1,A1:A0		; - borrow-in
	SPKERNEL 0,1			; leave slot for "return borrow flag"
||	STW	A0,*A3++		; write result
||	AND	1,A1,A2			; pass on borrow flag
;;====================================================================
	BNOP	RA,4
	AND	1,A1,RET		; return borrow flag
	.endasmfunc

;;--------------------------------------------------------------------
;; bn_div_words(hi=ARG0/A4, lo=ARG1/B4, dv=ARG2/A6)
;; 64/32-bit division: returns (hi:lo)/dv as one bit-by-bit
;; shift-subtract loop (SPLOOP at 3 cycles/bit), quotient built up
;; in A4.  If hi has at least as many significant bits as dv
;; (lmbd(hi) < lmbd(dv), i.e. the quotient would overflow 32 bits),
;; returns all-ones (-1) immediately.
;; NOTE(review): dv is normalized (shifted left by its leading-zero
;; count) and ILC set to 32+lmbd(dv) so the loop runs once per
;; quotient bit — presumably hi:lo need no pre-shift; confirm against
;; the C caller's contract.
;;--------------------------------------------------------------------
	.global	_bn_div_words
_bn_div_words:
	.asmfunc
	LMBD	1,A6,A0			; leading zero bits in dv
	LMBD	1,A4,A1			; leading zero bits in hi
||	MVK	32,B0
	CMPLTU	A1,A0,A2		; hi "bigger" than dv => overflow
||	ADD	A0,B0,B0		; loop count = 32 + lmbd(dv)
  [ A2]	BNOP	RA
||[ A2]	MVK	-1,A4			; return overflow
||[!A2]	MV	A4,A3			; reassign hi
  [!A2]	MV	B4,A4			; reassign lo, will be quotient
||[!A2]	MVC	B0,ILC
  [!A2]	SHL	A6,A0,A6		; normalize dv
||	MVK	1,A1

	;; First iteration peeled off ahead of the SPLOOP.
  [!A2]	CMPLTU	A3,A6,A1		; hi<dv?
||[!A2]	SHL	A4,1,A5:A4		; lo<<1
  [!A1]	SUB	A3,A6,A3		; hi-=dv
||[!A1]	OR	1,A4,A4
  [!A2]	SHRU	A3,31,A1		; upper bit
||[!A2]	ADDAH	A5,A3,A3		; hi<<1|lo>>31

	SPLOOP	3
  [!A1]	CMPLTU	A3,A6,A1		; hi<dv?
||[ A1]	ZERO	A1
||	SHL	A4,1,A5:A4		; lo<<1
  [!A1]	SUB	A3,A6,A3		; hi-=dv
||[!A1]	OR	1,A4,A4			; quotient
	SHRU	A3,31,A1		; upper bit
||	ADDAH	A5,A3,A3		; hi<<1|lo>>31
	SPKERNEL

	BNOP	RA,5
	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
;;--------------------------------------------------------------------
;; bn_mul_comba8(rp=ARG0, ap=ARG1, bp=ARG2): rp[0..15] = ap[0..7] *
;; bp[0..7], as M=8 outer passes of the bn_mul_add_words-style inner
;; SPLOOP (register files flipped A<>B per TI Advisory 15, SPRZ247I).
;; bn_sqr_comba8 is the same with bp aliased to ap.
;;--------------------------------------------------------------------
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
_bn_sqr_comba8:
	MV	ARG1,ARG2		; square: bp = ap
_bn_mul_comba8:
	.asmfunc
	MVK	8,B0			; N, RILC
||	MVK	8,A0			; M, outer loop counter
||	MV	ARG1,A5			; copy ap
||	MV	ARG0,B4			; copy rp
||	ZERO	B19			; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1			; N-2, initial ILC
||	SUB	B0,1,B2			; const B2=N-1
||	LDW	*A5++,B6		; ap[0]
||	MV	A0,A3			; const A3=M
sploopNxM?:				; for best performance arrange M<=N
   [A0]	SPLOOPD	2			; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5		; B5 -> rp[N], read pointer
||	ZERO	B7			; rp[i]=0 on first pass
||	LDW	*A5++,A9		; pre-fetch ap[1]
||	ZERO	A1			; no rp[i] load on first pass
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
	LDW	*ARG2++,A7		; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7		; rp[i]
	MPY32U	A7,B6,B17:B16		; bp[i]*ap[j] (64-bit)
	NOP	3			; [2,0] in epilogue
	ADDU	B16,B7,B21:B20		; lo += rp[i]
	ADDU	B19,B21:B20,B19:B18	; += running carry
||	MV.S	B17,B23
	SPKERNEL
||	STW	B18,*B4++		; rp[i]
||	ADD.S	B19,B23,B19		; carry = hi + carries
;;====================================================================
outer?:					; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2		; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2			; done pre-fetching ap[i+1]?
	MVD	A9,B6			; move through .M unit(*)
   [A2]	LDW	*A5++,A9		; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5		; rewind rp to rp[1]
	MVK	1,A1			; load rp[i] from now on
   [A0]	BNOP.S1	outer?,4
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]		; rewind rp to rp[1]
||	ZERO.S	B19			; high part of accumulator
;; end of outer?
	BNOP	RA,5			; return
	.endasmfunc
;; (*) It should be noted that B6 is used as input to MPY32U in
;;	chronologically next cycle in *preceding* SPLOOP iteration.
;;	Normally such arrangement would require DINT, but at this
;;	point SPLOOP is draining and interrupts are disabled
;;	implicitly.

;;--------------------------------------------------------------------
;; bn_mul_comba4(rp=ARG0, ap=ARG1, bp=ARG2): rp[0..7] = ap[0..3] *
;; bp[0..3].  bn_sqr_comba4 aliases bp = ap.  The generic NxM SPLOOP
;; path is kept under ".if 0" for reference; the live ".else" branch
;; is a fully unrolled, hand-scheduled Comba accumulation.
;;--------------------------------------------------------------------
	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
_bn_sqr_comba4:
	MV	ARG1,ARG2		; square: bp = ap
_bn_mul_comba4:
	.asmfunc
	.if	0
	BNOP	sploopNxM?,3
	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
	;; because of low-counter effect, when prologue phase finishes
	;; before SPKERNEL instruction is reached. As result it's 25%
	;; slower than expected...
	MVK	4,B0			; N, RILC
||	MVK	4,A0			; M, outer loop counter
||	MV	ARG1,A5			; copy ap
||	MV	ARG0,B4			; copy rp
||	ZERO	B19			; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1			; first ILC
||	SUB	B0,1,B2			; const B2=N-1
||	LDW	*A5++,B6		; ap[0]
||	MV	A0,A3			; const A3=M
	.else
	;; This alternative is an exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles...
	;; Partial products for each result column are summed in
	;; interleaved A- and B-side 40-bit accumulators (A1:A0/A9:A8
	;; low, B1:B0/B9:B8 high spill), one STW per column.
	LDW	*ARG1[0],B16		; a[0]
||	LDW	*ARG2[0],A16		; b[0]
	LDW	*ARG1[1],B17		; a[1]
||	LDW	*ARG2[1],A17		; b[1]
	LDW	*ARG1[2],B18		; a[2]
||	LDW	*ARG2[2],A18		; b[2]
	LDW	*ARG1[3],B19		; a[3]
||	LDW	*ARG2[3],A19		; b[3]
	NOP
	MPY32U	A16,B16,A1:A0		; a[0]*b[0]
	MPY32U	A17,B16,A23:A22		; a[0]*b[1]
	MPY32U	A16,B17,A25:A24		; a[1]*b[0]
	MPY32U	A16,B18,A27:A26		; a[2]*b[0]
	STW	A0,*ARG0[0]		; rp[0]
||	MPY32U	A17,B17,A29:A28		; a[1]*b[1]
	MPY32U	A18,B16,A31:A30		; a[0]*b[2]
||	ADDU	A22,A1,A1:A0
	MV	A23,B0
||	MPY32U	A19,B16,A21:A20		; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B0,B1:B0
||	STW	A0,*ARG0[1]		; rp[1]
||	MPY32U	A18,B17,A23:A22		; a[2]*b[1]
||	ADDU	A26,A1,A9:A8
	ADDU	A27,B1,B9:B8
||	MPY32U	A17,B18,A25:A24		; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8
	ADDU	A29,B9:B8,B9:B8
||	MPY32U	A16,B19,A27:A26		; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8
	ADDU	A31,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[2]		; rp[2]
||	ADDU	A20,A9,A1:A0
	ADDU	A21,B9,B1:B0
||	MPY32U	A19,B17,A21:A20		; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0
	ADDU	A23,B1:B0,B1:B0
||	MPY32U	A18,B18,A23:A22		; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B1:B0,B1:B0
||	MPY32U	A17,B19,A25:A24		; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0
	ADDU	A27,B1:B0,B1:B0
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[3]		; rp[3]
||	MPY32U	A19,B18,A27:A26		; a[3]*b[2]
||	ADDU	A20,A1,A9:A8
	ADDU	A21,B1,B9:B8
||	MPY32U	A18,B19,A29:A28		; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8
	ADDU	A23,B9:B8,B9:B8
||	MPY32U	A19,B19,A31:A30		; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8
	ADDU	A25,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[4]		; rp[4]
||	ADDU	A26,A9,A1:A0
	ADDU	A27,B9,B1:B0
||	ADDU	A28,A1:A0,A1:A0
	ADDU	A29,B1:B0,B1:B0
||	BNOP	RA			; branch issued; 5 delay slots below
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[5]		; rp[5]
||	ADDU	A30,A1,A9:A8
	ADD	A31,B1,B8
	ADDU	B0,A9:A8,A9:A8		; removed || to avoid cross-path stall below
	ADD	B8,A9,A9
||	STW	A8,*ARG0[6]		; rp[6]
	STW	A9,*ARG0[7]		; rp[7]
	.endif
	.endasmfunc