// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

// NOTE(review): the explanatory comments in this file were added during
// review. Because the file is generated, they would be lost on regeneration
// and ideally belong in the generating script.
//
// Overview: three entry points implementing GHASH with the AArch64
// polynomial-multiply instructions (pmull/pmull2). All of them are leaf
// routines that use no stack. Across the whole file only x0-x3, x12,
// v0-v7 and v16-v31 are referenced; v8-v15 are never touched.
//
// Recurring pattern: a 128x128-bit carry-less product is formed from three
// pmull/pmull2 results (low, high, and a Karatsuba middle term), merged in
// the "Karatsuba post-processing" step, and then folded back to 128 bits by
// a two-phase reduction that multiplies by the constant kept in v19.
// v19 is built with movi #0xe1 followed by shl #57 on each 64-bit lane,
// which leaves 0xc200000000000000 in both lanes.

#if __ARM_MAX_ARCH__>=7
.text

// _gcm_init_v8: precompute the key table used by the routines below.
//   x0 = output table pointer; six 16-byte entries are written
//   x1 = pointer to the 16-byte hash key H
// Table layout as stored by this routine:
//   [0] twisted H
//   [1] packed Karatsuba pre-processed halves for H and H^2
//   [2] H^2
//   [3] H^3
//   [4] packed Karatsuba pre-processed halves for H^3 and H^4
//   [5] H^4
// Presumed C prototype (not visible in this file, confirm against the
// caller): void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
// Registers used: x0 (post-incremented), x1, v0-v7, v16-v22.
.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

.align	4
_gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	// The two products (H*H^2 in v0-v2, H^2*H^2 in v5-v7) are computed and
	// reduced side by side, with their instructions interleaved.
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret

// _gcm_gmult_v8: multiply the 16-byte state Xi by H and reduce, in place.
//   x0 = Xi (loaded at entry, result stored back to the same address)
//   x1 = key table; only the first two 16-byte entries are read, and they
//        are expected to be in the layout written by _gcm_init_v8
// Presumed C prototype (not visible in this file, confirm against the
// caller): void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
// Registers used: x0, x1, v0-v3, v17-v21.
.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

.align	4
_gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
	// Unless built big-endian, reverse the bytes within each 64-bit half.
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

	// Undo the load-time byte reversal and half swap before storing.
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

// _gcm_ghash_v8: fold a run of 16-byte input blocks into the state Xi.
//   x0 = Xi (loaded at entry, result stored back at exit)
//   x1 = key table, expected in the layout written by _gcm_init_v8
//   x2 = input pointer
//   x3 = input length in bytes
// Presumed C prototype (not visible in this file, confirm against the
// caller): void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//                            const uint8_t *inp, size_t len);
// Lengths of 64 bytes or more branch to Lgcm_ghash_v8_4x (four blocks per
// iteration). Shorter inputs use Loop_mod2x_v8 (two blocks per iteration)
// and, when one block is left over, Lodd_tail_v8.
// NOTE(review): the first block is loaded unconditionally and the length is
// only ever compared in 16/32/64-byte steps, so len is presumably a
// non-zero multiple of 16 -- confirm with callers.
// Registers used on this path: x0-x3, x12, v0-v7, v16-v22.
.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

.align	4
_gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	// The flags tested here are still those set by the subs above; none
	// of the intervening instructions (ld1/movi/csel/ext/shl/rev64)
	// write the condition flags.
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	// Jump to the aligned loop head, skipping any padding .align inserts.
	b	Loop_mod2x_v8

.align	4
	// Loop invariants on entry to each iteration:
	//   v3     = current even block already XORed with the running state
	//   v4, v6 = low/high products of H with the following odd block
	//   v17    = Karatsuba pre-processed odd block
	// The loads for the next pair are issued inside the iteration, which
	// is why x12 is zeroed ahead of time near the end of the input.
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	// Flags still come from the subs at the top of the iteration.
	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
	// Single remaining block: multiply (inp ^ Xi) by H and reduce.
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

	// Common exit of the 2x path: undo the load-time transformation and
	// store the updated state.
Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret


// 4x path of _gcm_ghash_v8. It is reached only by the branch at the top of
// _gcm_ghash_v8, so x0-x3 carry the same meaning and x3 >= 64 on entry.
// It has no AARCH64_VALID_CALL_TARGET marker and is not declared .globl.
//
// Register roles:
//   v20-v22 = first three table entries, v26-v28 = next three
//   v4-v7   = the four input blocks of the current 64-byte group
//   v29/v30/v31 = running low/middle/high partial products for the three
//                 trailing blocks of the group (times H^3, H^2, H)
//   v0      = running state Xi; the leading block is XORed with it and
//             multiplied by H^4
// Each Loop4x iteration finishes the previous group (including its
// reduction) while starting the products for the group just loaded.
// Ltail4x finishes the last full group and then dispatches on the bytes
// left over: 0 -> Ldone4x, 16 -> Lone, 32 -> Ltwo, 48 -> Lthree.
// Registers used on this path: x0-x3, v0-v7, v16-v31.
.align	4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	// 64 bytes are already loaded; enter the loop only if at least
	// another full 64-byte group follows (original length >= 128).
	subs	x3,x3,#128
	b.lo	Ltail4x

	b	Loop4x

.align	4
Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	Loop4x

	// Finish the products for the last full group. The reduction of this
	// group is deferred to Ldone4x, or to the start of Lone/Ltwo/Lthree.
Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	// x3 is negative here; adding 64 back yields the leftover byte count.
	adds	x3,x3,#64
	b.eq	Ldone4x

	cmp	x3,#32
	b.lo	Lone
	b.eq	Ltwo
	// Three leftover blocks: reduce the pending product, then use
	// H^3, H^2 and H for the remaining blocks.
Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
	// Two leftover blocks: reduce the pending product, then use H^2 and H.
Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
	// One leftover block: reduce the pending product, then multiply
	// (Xi ^ block) by H and fall through into Ldone4x.
Lone:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

	// Common exit of the 4x path: final post-processing and reduction of
	// the pending product in v0/v1/v2, then store the updated state.
Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret

// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM