#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in paper. The reason is
#	twofold. This module doesn't make any assumption about
#	key schedule (or data for that matter) alignment and handles
#	it in-line. Secondly it, being transliterated from
#	vpaes-x86_64.pl, relies on "nested inversion" better suited
#	for Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Per-flavour ABI parameters: pointer size, link-register save slot,
# and the stack/load/store/compare mnemonics used throughout.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Locate the perlasm translator relative to this script and pipe our
# output through it.  NOTE: low-precedence "or" is required here --
# "|| die" would bind to the (always true) string argument and mask
# open() failures.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$code.=<<___;
.machine	"any"

.text

.align	7		# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:	# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:	# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:	# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb1:	# sb1u, sb1t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sb2:	# sb2u, sb2t
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev

##
##  Decryption stuff
##
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev

Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis

Lk_opt:	# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12	#vvvvv "distance between . and _vpaes_consts
	addi	r12,r12,-0x308
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz  "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
.align	6
___

# Register aliases shared by all routines below: v26-v31 hold the
# unaligned-access permutations, r3-r5 carry the (inp,out,key)
# arguments, v10-v23 cache the lookup tables loaded by the preheats.
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
{
my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.align	4
_vpaes_encrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0xe0		# Lk_ipt
	li	r8,  0xf0
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x100
	lvx	$invhi, r12, r10
	li	r10, 0x110
	lvx	$iptlo, r12, r9
	li	r9,  0x120
	lvx	$ipthi, r12, r8
	li	r8,  0x130
	lvx	$sbou, r12, r11
	li	r11, 0x140
	lvx	$sbot, r12, r10
	li	r10, 0x150
	lvx	$sb1u, r12, r9
	lvx	$sb1t, r12, r8
	lvx	$sb2u, r12, r11
	lvx	$sb2t, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
##
.align 5
_vpaes_encrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm5		# round0 key
	li	r11, 0x10
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	addi	r10, r11, 0x40
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm3,	%xmm2
	vxor	v0, v0, v5		# vpxor	%xmm5,	%xmm1,	%xmm0
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
	mtctr	r8
	b	Lenc_entry

.align 4
Lenc_loop:
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	addi	r11, r11, 16
	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and		\$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	addi	r10, r11, 0x40
	vperm	v3, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D

Lenc_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, 	%xmm10,	%xmm4	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v5		# vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
	bdnz	Lenc_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_encrypt
.align	5
.vpaes_encrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Lenc_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Lenc_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Lenc_out_unaligned
	b	Lenc_done

.align	4
Lenc_out_aligned:
	stvx	v0, 0, $out
Lenc_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_encrypt,.-.vpaes_encrypt

.align	4
_vpaes_decrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0x160		# Ldipt
	li	r8,  0x170
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x180
	lvx	$invhi, r12, r10
	li	r10, 0x190
	lvx	$iptlo, r12, r9
	li	r9,  0x1a0
	lvx	$ipthi, r12, r8
	li	r8,  0x1b0
	lvx	$sbou, r12, r11
	li	r11, 0x1c0
	lvx	$sbot, r12, r10
	li	r10, 0x1d0
	lvx	$sb9u, r12, r9
	li	r9,  0x1e0
	lvx	$sb9t, r12, r8
	li	r8,  0x1f0
	lvx	$sbdu, r12, r11
	li	r11, 0x200
	lvx	$sbdt, r12, r10
	li	r10, 0x210
	lvx	$sbbu, r12, r9
	lvx	$sbbt, r12, r8
	lvx	$sbeu, r12, r11
	lvx	$sbet, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  Decryption core
##
##  Same API as encryption core.
##
.align	4
_vpaes_decrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm4		# round0 key
	li	r11, 0x30
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
	vxor	v0, v0, v5		# vpxor	%xmm4,	%xmm2,	%xmm2
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
	mtctr	r8
	b	Ldec_entry

.align 4
Ldec_loop:
#
#  Inverse mix columns
#
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
					# vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	subi	r11, r11, 16
	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	andi.	r11, r11, 0x30
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0
					# vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
					# vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
					# vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	vxor	v0, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
	vxor	v0, v0, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch

Ldec_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
	bdnz	Ldec_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
					# vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb	%xmm2,	%xmm0,	%xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_decrypt
.align	5
.vpaes_decrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Ldec_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Ldec_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Ldec_out_unaligned
	b	Ldec_done

.align	4
Ldec_out_aligned:
	stvx	v0, 0, $out
Ldec_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_decrypt,.-.vpaes_decrypt

.globl	.vpaes_cbc_encrypt
.align	5
.vpaes_cbc_encrypt:
	${UCMP}i r5,16
	bltlr-

	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	mflr	r0
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r12, 256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	li	r9, -16
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
6807bded2dbSJung-uk Kim mr r5, r6 # copy pointer to key 6817bded2dbSJung-uk Kim mr r31, r7 # copy pointer to iv 6827bded2dbSJung-uk Kim li r6, -1 68380815a77SJung-uk Kim mcrf cr1, cr0 # put aside $out alignment flag 6847bded2dbSJung-uk Kim mr r7, r12 # copy vrsave 6857bded2dbSJung-uk Kim mtspr 256, r6 # preserve all AltiVec registers 6867bded2dbSJung-uk Kim 6877bded2dbSJung-uk Kim lvx v24, 0, r31 # load [potentially unaligned] iv 6887bded2dbSJung-uk Kim li r9, 15 6897bded2dbSJung-uk Kim ?lvsl $inpperm, 0, r31 6907bded2dbSJung-uk Kim lvx v25, r9, r31 6917bded2dbSJung-uk Kim ?vperm v24, v24, v25, $inpperm 6927bded2dbSJung-uk Kim 69380815a77SJung-uk Kim cmpwi r8, 0 # test direction 6947bded2dbSJung-uk Kim neg r8, $inp # prepare for unaligned access 6957bded2dbSJung-uk Kim vxor v7, v7, v7 6967bded2dbSJung-uk Kim ?lvsl $keyperm, 0, $key 6977bded2dbSJung-uk Kim ?lvsr $outperm, 0, $out 6987bded2dbSJung-uk Kim ?lvsr $inpperm, 0, r8 # -$inp 6997bded2dbSJung-uk Kim vnor $outmask, v7, v7 # 0xff..ff 7007bded2dbSJung-uk Kim lvx $inptail, 0, $inp 7017bded2dbSJung-uk Kim ?vperm $outmask, v7, $outmask, $outperm 7027bded2dbSJung-uk Kim addi $inp, $inp, 15 # 15 is not a typo 7037bded2dbSJung-uk Kim 7047bded2dbSJung-uk Kim beq Lcbc_decrypt 7057bded2dbSJung-uk Kim 7067bded2dbSJung-uk Kim bl _vpaes_encrypt_preheat 7077bded2dbSJung-uk Kim li r0, 16 7087bded2dbSJung-uk Kim 70980815a77SJung-uk Kim beq cr1, Lcbc_enc_loop # $out is aligned 71080815a77SJung-uk Kim 71180815a77SJung-uk Kim vmr v0, $inptail 71280815a77SJung-uk Kim lvx $inptail, 0, $inp 71380815a77SJung-uk Kim addi $inp, $inp, 16 71480815a77SJung-uk Kim ?vperm v0, v0, $inptail, $inpperm 71580815a77SJung-uk Kim vxor v0, v0, v24 # ^= iv 71680815a77SJung-uk Kim 71780815a77SJung-uk Kim bl _vpaes_encrypt_core 71880815a77SJung-uk Kim 71980815a77SJung-uk Kim andi. 
r8, $out, 15 72080815a77SJung-uk Kim vmr v24, v0 # put aside iv 72180815a77SJung-uk Kim sub r9, $out, r8 72280815a77SJung-uk Kim vperm $outhead, v0, v0, $outperm # rotate right/left 72380815a77SJung-uk Kim 72480815a77SJung-uk KimLcbc_enc_head: 72580815a77SJung-uk Kim stvebx $outhead, r8, r9 72680815a77SJung-uk Kim cmpwi r8, 15 72780815a77SJung-uk Kim addi r8, r8, 1 72880815a77SJung-uk Kim bne Lcbc_enc_head 72980815a77SJung-uk Kim 73080815a77SJung-uk Kim sub. r30, r30, r0 # len -= 16 73180815a77SJung-uk Kim addi $out, $out, 16 73280815a77SJung-uk Kim beq Lcbc_unaligned_done 73380815a77SJung-uk Kim 7347bded2dbSJung-uk KimLcbc_enc_loop: 7357bded2dbSJung-uk Kim vmr v0, $inptail 7367bded2dbSJung-uk Kim lvx $inptail, 0, $inp 7377bded2dbSJung-uk Kim addi $inp, $inp, 16 7387bded2dbSJung-uk Kim ?vperm v0, v0, $inptail, $inpperm 7397bded2dbSJung-uk Kim vxor v0, v0, v24 # ^= iv 7407bded2dbSJung-uk Kim 7417bded2dbSJung-uk Kim bl _vpaes_encrypt_core 7427bded2dbSJung-uk Kim 7437bded2dbSJung-uk Kim vmr v24, v0 # put aside iv 7447bded2dbSJung-uk Kim sub. 
r30, r30, r0 # len -= 16 7457bded2dbSJung-uk Kim vperm v0, v0, v0, $outperm # rotate right/left 7467bded2dbSJung-uk Kim vsel v1, $outhead, v0, $outmask 7477bded2dbSJung-uk Kim vmr $outhead, v0 7487bded2dbSJung-uk Kim stvx v1, 0, $out 7497bded2dbSJung-uk Kim addi $out, $out, 16 7507bded2dbSJung-uk Kim bne Lcbc_enc_loop 7517bded2dbSJung-uk Kim 7527bded2dbSJung-uk Kim b Lcbc_done 7537bded2dbSJung-uk Kim 7547bded2dbSJung-uk Kim.align 5 7557bded2dbSJung-uk KimLcbc_decrypt: 7567bded2dbSJung-uk Kim bl _vpaes_decrypt_preheat 7577bded2dbSJung-uk Kim li r0, 16 7587bded2dbSJung-uk Kim 75980815a77SJung-uk Kim beq cr1, Lcbc_dec_loop # $out is aligned 76080815a77SJung-uk Kim 76180815a77SJung-uk Kim vmr v0, $inptail 76280815a77SJung-uk Kim lvx $inptail, 0, $inp 76380815a77SJung-uk Kim addi $inp, $inp, 16 76480815a77SJung-uk Kim ?vperm v0, v0, $inptail, $inpperm 76580815a77SJung-uk Kim vmr v25, v0 # put aside input 76680815a77SJung-uk Kim 76780815a77SJung-uk Kim bl _vpaes_decrypt_core 76880815a77SJung-uk Kim 76980815a77SJung-uk Kim andi. r8, $out, 15 77080815a77SJung-uk Kim vxor v0, v0, v24 # ^= iv 77180815a77SJung-uk Kim vmr v24, v25 77280815a77SJung-uk Kim sub r9, $out, r8 77380815a77SJung-uk Kim vperm $outhead, v0, v0, $outperm # rotate right/left 77480815a77SJung-uk Kim 77580815a77SJung-uk KimLcbc_dec_head: 77680815a77SJung-uk Kim stvebx $outhead, r8, r9 77780815a77SJung-uk Kim cmpwi r8, 15 77880815a77SJung-uk Kim addi r8, r8, 1 77980815a77SJung-uk Kim bne Lcbc_dec_head 78080815a77SJung-uk Kim 78180815a77SJung-uk Kim sub. 
r30, r30, r0 # len -= 16 78280815a77SJung-uk Kim addi $out, $out, 16 78380815a77SJung-uk Kim beq Lcbc_unaligned_done 78480815a77SJung-uk Kim 7857bded2dbSJung-uk KimLcbc_dec_loop: 7867bded2dbSJung-uk Kim vmr v0, $inptail 7877bded2dbSJung-uk Kim lvx $inptail, 0, $inp 7887bded2dbSJung-uk Kim addi $inp, $inp, 16 7897bded2dbSJung-uk Kim ?vperm v0, v0, $inptail, $inpperm 7907bded2dbSJung-uk Kim vmr v25, v0 # put aside input 7917bded2dbSJung-uk Kim 7927bded2dbSJung-uk Kim bl _vpaes_decrypt_core 7937bded2dbSJung-uk Kim 7947bded2dbSJung-uk Kim vxor v0, v0, v24 # ^= iv 7957bded2dbSJung-uk Kim vmr v24, v25 7967bded2dbSJung-uk Kim sub. r30, r30, r0 # len -= 16 7977bded2dbSJung-uk Kim vperm v0, v0, v0, $outperm # rotate right/left 7987bded2dbSJung-uk Kim vsel v1, $outhead, v0, $outmask 7997bded2dbSJung-uk Kim vmr $outhead, v0 8007bded2dbSJung-uk Kim stvx v1, 0, $out 8017bded2dbSJung-uk Kim addi $out, $out, 16 8027bded2dbSJung-uk Kim bne Lcbc_dec_loop 8037bded2dbSJung-uk Kim 8047bded2dbSJung-uk KimLcbc_done: 80580815a77SJung-uk Kim beq cr1, Lcbc_write_iv # $out is aligned 8067bded2dbSJung-uk Kim 80780815a77SJung-uk KimLcbc_unaligned_done: 80880815a77SJung-uk Kim andi. 
r8, $out, 15 80980815a77SJung-uk Kim sub $out, $out, r8 81080815a77SJung-uk Kim li r9, 0 81180815a77SJung-uk KimLcbc_tail: 81280815a77SJung-uk Kim stvebx $outhead, r9, $out 81380815a77SJung-uk Kim addi r9, r9, 1 81480815a77SJung-uk Kim cmpw r9, r8 81580815a77SJung-uk Kim bne Lcbc_tail 81680815a77SJung-uk Kim 81780815a77SJung-uk KimLcbc_write_iv: 8187bded2dbSJung-uk Kim neg r8, r31 # write [potentially unaligned] iv 81980815a77SJung-uk Kim li r10, 4 8207bded2dbSJung-uk Kim ?lvsl $outperm, 0, r8 82180815a77SJung-uk Kim li r11, 8 82280815a77SJung-uk Kim li r12, 12 8237bded2dbSJung-uk Kim vperm v24, v24, v24, $outperm # rotate right/left 82480815a77SJung-uk Kim stvewx v24, 0, r31 # ivp is at least 32-bit aligned 82580815a77SJung-uk Kim stvewx v24, r10, r31 82680815a77SJung-uk Kim stvewx v24, r11, r31 82780815a77SJung-uk Kim stvewx v24, r12, r31 8287bded2dbSJung-uk Kim 8297bded2dbSJung-uk Kim mtspr 256, r7 # restore vrsave 8307bded2dbSJung-uk Kim li r10,`15+6*$SIZE_T` 8317bded2dbSJung-uk Kim li r11,`31+6*$SIZE_T` 8327bded2dbSJung-uk Kim lvx v20,r10,$sp 8337bded2dbSJung-uk Kim addi r10,r10,32 8347bded2dbSJung-uk Kim lvx v21,r11,$sp 8357bded2dbSJung-uk Kim addi r11,r11,32 8367bded2dbSJung-uk Kim lvx v22,r10,$sp 8377bded2dbSJung-uk Kim addi r10,r10,32 8387bded2dbSJung-uk Kim lvx v23,r11,$sp 8397bded2dbSJung-uk Kim addi r11,r11,32 8407bded2dbSJung-uk Kim lvx v24,r10,$sp 8417bded2dbSJung-uk Kim addi r10,r10,32 8427bded2dbSJung-uk Kim lvx v25,r11,$sp 8437bded2dbSJung-uk Kim addi r11,r11,32 8447bded2dbSJung-uk Kim lvx v26,r10,$sp 8457bded2dbSJung-uk Kim addi r10,r10,32 8467bded2dbSJung-uk Kim lvx v27,r11,$sp 8477bded2dbSJung-uk Kim addi r11,r11,32 8487bded2dbSJung-uk Kim lvx v28,r10,$sp 8497bded2dbSJung-uk Kim addi r10,r10,32 8507bded2dbSJung-uk Kim lvx v29,r11,$sp 8517bded2dbSJung-uk Kim addi r11,r11,32 8527bded2dbSJung-uk Kim lvx v30,r10,$sp 8537bded2dbSJung-uk Kim lvx v31,r11,$sp 8547bded2dbSJung-uk KimLcbc_abort: 8557bded2dbSJung-uk Kim $POP r0, 
`$FRAME+$SIZE_T*2+$LRSAVE`($sp) 8567bded2dbSJung-uk Kim $POP r30,`$FRAME+$SIZE_T*0`($sp) 8577bded2dbSJung-uk Kim $POP r31,`$FRAME+$SIZE_T*1`($sp) 8587bded2dbSJung-uk Kim mtlr r0 8597bded2dbSJung-uk Kim addi $sp,$sp,`$FRAME+$SIZE_T*2` 8607bded2dbSJung-uk Kim blr 8617bded2dbSJung-uk Kim .long 0 8627bded2dbSJung-uk Kim .byte 0,12,0x04,1,0x80,2,6,0 8637bded2dbSJung-uk Kim .long 0 8647bded2dbSJung-uk Kim.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt 8657bded2dbSJung-uk Kim___ 8667bded2dbSJung-uk Kim} 8677bded2dbSJung-uk Kim{ 8687bded2dbSJung-uk Kimmy ($inp,$bits,$out)=map("r$_",(3..5)); 8697bded2dbSJung-uk Kimmy $dir="cr1"; 8707bded2dbSJung-uk Kimmy ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24)); 8717bded2dbSJung-uk Kim 8727bded2dbSJung-uk Kim$code.=<<___; 8737bded2dbSJung-uk Kim######################################################## 8747bded2dbSJung-uk Kim## ## 8757bded2dbSJung-uk Kim## AES key schedule ## 8767bded2dbSJung-uk Kim## ## 8777bded2dbSJung-uk Kim######################################################## 8787bded2dbSJung-uk Kim.align 4 8797bded2dbSJung-uk Kim_vpaes_key_preheat: 8807bded2dbSJung-uk Kim mflr r8 8817bded2dbSJung-uk Kim bl Lconsts 8827bded2dbSJung-uk Kim mtlr r8 8837bded2dbSJung-uk Kim li r11, 0xc0 # Lk_inv 8847bded2dbSJung-uk Kim li r10, 0xd0 8857bded2dbSJung-uk Kim li r9, 0xe0 # L_ipt 8867bded2dbSJung-uk Kim li r8, 0xf0 8877bded2dbSJung-uk Kim 8887bded2dbSJung-uk Kim vspltisb v8,4 # 0x04..04 8897bded2dbSJung-uk Kim vxor v9,v9,v9 # 0x00..00 8907bded2dbSJung-uk Kim lvx $invlo, r12, r11 # Lk_inv 8917bded2dbSJung-uk Kim li r11, 0x120 8927bded2dbSJung-uk Kim lvx $invhi, r12, r10 8937bded2dbSJung-uk Kim li r10, 0x130 8947bded2dbSJung-uk Kim lvx $iptlo, r12, r9 # Lk_ipt 8957bded2dbSJung-uk Kim li r9, 0x220 8967bded2dbSJung-uk Kim lvx $ipthi, r12, r8 8977bded2dbSJung-uk Kim li r8, 0x230 8987bded2dbSJung-uk Kim 8997bded2dbSJung-uk Kim lvx v14, r12, r11 # Lk_sb1 9007bded2dbSJung-uk Kim li r11, 0x240 9017bded2dbSJung-uk Kim lvx v15, r12, r10 
9027bded2dbSJung-uk Kim li r10, 0x250 9037bded2dbSJung-uk Kim 9047bded2dbSJung-uk Kim lvx v16, r12, r9 # Lk_dksd 9057bded2dbSJung-uk Kim li r9, 0x260 9067bded2dbSJung-uk Kim lvx v17, r12, r8 9077bded2dbSJung-uk Kim li r8, 0x270 9087bded2dbSJung-uk Kim lvx v18, r12, r11 # Lk_dksb 9097bded2dbSJung-uk Kim li r11, 0x280 9107bded2dbSJung-uk Kim lvx v19, r12, r10 9117bded2dbSJung-uk Kim li r10, 0x290 9127bded2dbSJung-uk Kim lvx v20, r12, r9 # Lk_dkse 9137bded2dbSJung-uk Kim li r9, 0x2a0 9147bded2dbSJung-uk Kim lvx v21, r12, r8 9157bded2dbSJung-uk Kim li r8, 0x2b0 9167bded2dbSJung-uk Kim lvx v22, r12, r11 # Lk_dks9 9177bded2dbSJung-uk Kim lvx v23, r12, r10 9187bded2dbSJung-uk Kim 9197bded2dbSJung-uk Kim lvx v24, r12, r9 # Lk_rcon 9207bded2dbSJung-uk Kim lvx v25, 0, r12 # Lk_mc_forward[0] 9217bded2dbSJung-uk Kim lvx v26, r12, r8 # Lks63 9227bded2dbSJung-uk Kim blr 9237bded2dbSJung-uk Kim .long 0 9247bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 9257bded2dbSJung-uk Kim 9267bded2dbSJung-uk Kim.align 4 9277bded2dbSJung-uk Kim_vpaes_schedule_core: 9287bded2dbSJung-uk Kim mflr r7 9297bded2dbSJung-uk Kim 9307bded2dbSJung-uk Kim bl _vpaes_key_preheat # load the tables 9317bded2dbSJung-uk Kim 9327bded2dbSJung-uk Kim #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned) 9337bded2dbSJung-uk Kim neg r8, $inp # prepare for unaligned access 9347bded2dbSJung-uk Kim lvx v0, 0, $inp 9357bded2dbSJung-uk Kim addi $inp, $inp, 15 # 15 is not typo 9367bded2dbSJung-uk Kim ?lvsr $inpperm, 0, r8 # -$inp 9377bded2dbSJung-uk Kim lvx v6, 0, $inp # v6 serves as inptail 9387bded2dbSJung-uk Kim addi $inp, $inp, 8 9397bded2dbSJung-uk Kim ?vperm v0, v0, v6, $inpperm 9407bded2dbSJung-uk Kim 9417bded2dbSJung-uk Kim # input transform 9427bded2dbSJung-uk Kim vmr v3, v0 # vmovdqa %xmm0, %xmm3 9437bded2dbSJung-uk Kim bl _vpaes_schedule_transform 9447bded2dbSJung-uk Kim vmr v7, v0 # vmovdqa %xmm0, %xmm7 9457bded2dbSJung-uk Kim 9467bded2dbSJung-uk Kim bne $dir, Lschedule_am_decrypting 
9477bded2dbSJung-uk Kim 9487bded2dbSJung-uk Kim # encrypting, output zeroth round key after transform 9497bded2dbSJung-uk Kim li r8, 0x30 # mov \$0x30,%r8d 95080815a77SJung-uk Kim li r9, 4 95180815a77SJung-uk Kim li r10, 8 95280815a77SJung-uk Kim li r11, 12 9537bded2dbSJung-uk Kim 9547bded2dbSJung-uk Kim ?lvsr $outperm, 0, $out # prepare for unaligned access 9557bded2dbSJung-uk Kim vnor $outmask, v9, v9 # 0xff..ff 9567bded2dbSJung-uk Kim ?vperm $outmask, v9, $outmask, $outperm 9577bded2dbSJung-uk Kim 9587bded2dbSJung-uk Kim #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) 95980815a77SJung-uk Kim vperm $outhead, v0, v0, $outperm # rotate right/left 96080815a77SJung-uk Kim stvewx $outhead, 0, $out # some are superfluous 96180815a77SJung-uk Kim stvewx $outhead, r9, $out 96280815a77SJung-uk Kim stvewx $outhead, r10, $out 96380815a77SJung-uk Kim addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 96480815a77SJung-uk Kim stvewx $outhead, r11, $out 9657bded2dbSJung-uk Kim b Lschedule_go 9667bded2dbSJung-uk Kim 9677bded2dbSJung-uk KimLschedule_am_decrypting: 9687bded2dbSJung-uk Kim srwi r8, $bits, 1 # shr \$1,%r8d 9697bded2dbSJung-uk Kim andi. 
r8, r8, 32 # and \$32,%r8d 9707bded2dbSJung-uk Kim xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 9717bded2dbSJung-uk Kim addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 9727bded2dbSJung-uk Kim # decrypting, output zeroth round key after shiftrows 9737bded2dbSJung-uk Kim lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 97480815a77SJung-uk Kim li r9, 4 97580815a77SJung-uk Kim li r10, 8 97680815a77SJung-uk Kim li r11, 12 9777bded2dbSJung-uk Kim vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 9787bded2dbSJung-uk Kim 9797bded2dbSJung-uk Kim neg r0, $out # prepare for unaligned access 9807bded2dbSJung-uk Kim ?lvsl $outperm, 0, r0 9817bded2dbSJung-uk Kim vnor $outmask, v9, v9 # 0xff..ff 9827bded2dbSJung-uk Kim ?vperm $outmask, $outmask, v9, $outperm 9837bded2dbSJung-uk Kim 9847bded2dbSJung-uk Kim #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) 98580815a77SJung-uk Kim vperm $outhead, v4, v4, $outperm # rotate right/left 98680815a77SJung-uk Kim stvewx $outhead, 0, $out # some are superfluous 98780815a77SJung-uk Kim stvewx $outhead, r9, $out 98880815a77SJung-uk Kim stvewx $outhead, r10, $out 98980815a77SJung-uk Kim addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 99080815a77SJung-uk Kim stvewx $outhead, r11, $out 99180815a77SJung-uk Kim addi $out, $out, 15 # 15 is not typo 9927bded2dbSJung-uk Kim xori r8, r8, 0x30 # xor \$0x30, %r8 9937bded2dbSJung-uk Kim 9947bded2dbSJung-uk KimLschedule_go: 9957bded2dbSJung-uk Kim cmplwi $bits, 192 # cmp \$192, %esi 9967bded2dbSJung-uk Kim bgt Lschedule_256 9977bded2dbSJung-uk Kim beq Lschedule_192 9987bded2dbSJung-uk Kim # 128: fall though 9997bded2dbSJung-uk Kim 10007bded2dbSJung-uk Kim## 10017bded2dbSJung-uk Kim## .schedule_128 10027bded2dbSJung-uk Kim## 10037bded2dbSJung-uk Kim## 128-bit specific part of key schedule. 10047bded2dbSJung-uk Kim## 10057bded2dbSJung-uk Kim## This schedule is really simple, because all its parts 10067bded2dbSJung-uk Kim## are accomplished by the subroutines. 
10077bded2dbSJung-uk Kim## 10087bded2dbSJung-uk KimLschedule_128: 10097bded2dbSJung-uk Kim li r0, 10 # mov \$10, %esi 10107bded2dbSJung-uk Kim mtctr r0 10117bded2dbSJung-uk Kim 10127bded2dbSJung-uk KimLoop_schedule_128: 10137bded2dbSJung-uk Kim bl _vpaes_schedule_round 10147bded2dbSJung-uk Kim bdz Lschedule_mangle_last # dec %esi 10157bded2dbSJung-uk Kim bl _vpaes_schedule_mangle # write output 10167bded2dbSJung-uk Kim b Loop_schedule_128 10177bded2dbSJung-uk Kim 10187bded2dbSJung-uk Kim## 10197bded2dbSJung-uk Kim## .aes_schedule_192 10207bded2dbSJung-uk Kim## 10217bded2dbSJung-uk Kim## 192-bit specific part of key schedule. 10227bded2dbSJung-uk Kim## 10237bded2dbSJung-uk Kim## The main body of this schedule is the same as the 128-bit 10247bded2dbSJung-uk Kim## schedule, but with more smearing. The long, high side is 10257bded2dbSJung-uk Kim## stored in %xmm7 as before, and the short, low side is in 10267bded2dbSJung-uk Kim## the high bits of %xmm6. 10277bded2dbSJung-uk Kim## 10287bded2dbSJung-uk Kim## This schedule is somewhat nastier, however, because each 10297bded2dbSJung-uk Kim## round produces 192 bits of key material, or 1.5 round keys. 10307bded2dbSJung-uk Kim## Therefore, on each cycle we do 2 rounds and produce 3 round 10317bded2dbSJung-uk Kim## keys. 
10327bded2dbSJung-uk Kim## 10337bded2dbSJung-uk Kim.align 4 10347bded2dbSJung-uk KimLschedule_192: 10357bded2dbSJung-uk Kim li r0, 4 # mov \$4, %esi 10367bded2dbSJung-uk Kim lvx v0, 0, $inp 10377bded2dbSJung-uk Kim ?vperm v0, v6, v0, $inpperm 10387bded2dbSJung-uk Kim ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 10397bded2dbSJung-uk Kim bl _vpaes_schedule_transform # input transform 10407bded2dbSJung-uk Kim ?vsldoi v6, v0, v9, 8 10417bded2dbSJung-uk Kim ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros 10427bded2dbSJung-uk Kim mtctr r0 10437bded2dbSJung-uk Kim 10447bded2dbSJung-uk KimLoop_schedule_192: 10457bded2dbSJung-uk Kim bl _vpaes_schedule_round 10467bded2dbSJung-uk Kim ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 10477bded2dbSJung-uk Kim bl _vpaes_schedule_mangle # save key n 10487bded2dbSJung-uk Kim bl _vpaes_schedule_192_smear 10497bded2dbSJung-uk Kim bl _vpaes_schedule_mangle # save key n+1 10507bded2dbSJung-uk Kim bl _vpaes_schedule_round 10517bded2dbSJung-uk Kim bdz Lschedule_mangle_last # dec %esi 10527bded2dbSJung-uk Kim bl _vpaes_schedule_mangle # save key n+2 10537bded2dbSJung-uk Kim bl _vpaes_schedule_192_smear 10547bded2dbSJung-uk Kim b Loop_schedule_192 10557bded2dbSJung-uk Kim 10567bded2dbSJung-uk Kim## 10577bded2dbSJung-uk Kim## .aes_schedule_256 10587bded2dbSJung-uk Kim## 10597bded2dbSJung-uk Kim## 256-bit specific part of key schedule. 10607bded2dbSJung-uk Kim## 10617bded2dbSJung-uk Kim## The structure here is very similar to the 128-bit 10627bded2dbSJung-uk Kim## schedule, but with an additional "low side" in 10637bded2dbSJung-uk Kim## %xmm6. The low side's rounds are the same as the 10647bded2dbSJung-uk Kim## high side's, except no rcon and no rotation. 
10657bded2dbSJung-uk Kim## 10667bded2dbSJung-uk Kim.align 4 10677bded2dbSJung-uk KimLschedule_256: 10687bded2dbSJung-uk Kim li r0, 7 # mov \$7, %esi 10697bded2dbSJung-uk Kim addi $inp, $inp, 8 10707bded2dbSJung-uk Kim lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 10717bded2dbSJung-uk Kim ?vperm v0, v6, v0, $inpperm 10727bded2dbSJung-uk Kim bl _vpaes_schedule_transform # input transform 10737bded2dbSJung-uk Kim mtctr r0 10747bded2dbSJung-uk Kim 10757bded2dbSJung-uk KimLoop_schedule_256: 10767bded2dbSJung-uk Kim bl _vpaes_schedule_mangle # output low result 10777bded2dbSJung-uk Kim vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 10787bded2dbSJung-uk Kim 10797bded2dbSJung-uk Kim # high round 10807bded2dbSJung-uk Kim bl _vpaes_schedule_round 10817bded2dbSJung-uk Kim bdz Lschedule_mangle_last # dec %esi 10827bded2dbSJung-uk Kim bl _vpaes_schedule_mangle 10837bded2dbSJung-uk Kim 10847bded2dbSJung-uk Kim # low round. swap xmm7 and xmm6 10857bded2dbSJung-uk Kim ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 10867bded2dbSJung-uk Kim vmr v5, v7 # vmovdqa %xmm7, %xmm5 10877bded2dbSJung-uk Kim vmr v7, v6 # vmovdqa %xmm6, %xmm7 10887bded2dbSJung-uk Kim bl _vpaes_schedule_low_round 10897bded2dbSJung-uk Kim vmr v7, v5 # vmovdqa %xmm5, %xmm7 10907bded2dbSJung-uk Kim 10917bded2dbSJung-uk Kim b Loop_schedule_256 10927bded2dbSJung-uk Kim## 10937bded2dbSJung-uk Kim## .aes_schedule_mangle_last 10947bded2dbSJung-uk Kim## 10957bded2dbSJung-uk Kim## Mangler for last round of key schedule 10967bded2dbSJung-uk Kim## Mangles %xmm0 10977bded2dbSJung-uk Kim## when encrypting, outputs out(%xmm0) ^ 63 10987bded2dbSJung-uk Kim## when decrypting, outputs unskew(%xmm0) 10997bded2dbSJung-uk Kim## 11007bded2dbSJung-uk Kim## Always called right before return... 
jumps to cleanup and exits 11017bded2dbSJung-uk Kim## 11027bded2dbSJung-uk Kim.align 4 11037bded2dbSJung-uk KimLschedule_mangle_last: 11047bded2dbSJung-uk Kim # schedule last round key from xmm0 11057bded2dbSJung-uk Kim li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11 11067bded2dbSJung-uk Kim li r9, 0x2f0 11077bded2dbSJung-uk Kim bne $dir, Lschedule_mangle_last_dec 11087bded2dbSJung-uk Kim 11097bded2dbSJung-uk Kim # encrypting 11107bded2dbSJung-uk Kim lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1 11117bded2dbSJung-uk Kim li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform 11127bded2dbSJung-uk Kim li r9, 0x2d0 # prepare to output transform 11137bded2dbSJung-uk Kim vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute 11147bded2dbSJung-uk Kim 11157bded2dbSJung-uk Kim lvx $iptlo, r11, r12 # reload $ipt 11167bded2dbSJung-uk Kim lvx $ipthi, r9, r12 11177bded2dbSJung-uk Kim addi $out, $out, 16 # add \$16, %rdx 11187bded2dbSJung-uk Kim vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 11197bded2dbSJung-uk Kim bl _vpaes_schedule_transform # output transform 11207bded2dbSJung-uk Kim 11217bded2dbSJung-uk Kim #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 11227bded2dbSJung-uk Kim vperm v0, v0, v0, $outperm # rotate right/left 112380815a77SJung-uk Kim li r10, 4 11247bded2dbSJung-uk Kim vsel v2, $outhead, v0, $outmask 112580815a77SJung-uk Kim li r11, 8 11267bded2dbSJung-uk Kim stvx v2, 0, $out 112780815a77SJung-uk Kim li r12, 12 112880815a77SJung-uk Kim stvewx v0, 0, $out # some (or all) are redundant 112980815a77SJung-uk Kim stvewx v0, r10, $out 113080815a77SJung-uk Kim stvewx v0, r11, $out 113180815a77SJung-uk Kim stvewx v0, r12, $out 11327bded2dbSJung-uk Kim b Lschedule_mangle_done 11337bded2dbSJung-uk Kim 11347bded2dbSJung-uk Kim.align 4 11357bded2dbSJung-uk KimLschedule_mangle_last_dec: 11367bded2dbSJung-uk Kim lvx $iptlo, r11, r12 # reload $ipt 11377bded2dbSJung-uk Kim lvx $ipthi, r9, r12 11387bded2dbSJung-uk Kim addi $out, $out, -16 # 
add \$-16, %rdx 11397bded2dbSJung-uk Kim vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 11407bded2dbSJung-uk Kim bl _vpaes_schedule_transform # output transform 11417bded2dbSJung-uk Kim 11427bded2dbSJung-uk Kim #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 114380815a77SJung-uk Kim addi r9, $out, -15 # -15 is not typo 11447bded2dbSJung-uk Kim vperm v0, v0, v0, $outperm # rotate right/left 114580815a77SJung-uk Kim li r10, 4 11467bded2dbSJung-uk Kim vsel v2, $outhead, v0, $outmask 114780815a77SJung-uk Kim li r11, 8 11487bded2dbSJung-uk Kim stvx v2, 0, $out 114980815a77SJung-uk Kim li r12, 12 115080815a77SJung-uk Kim stvewx v0, 0, r9 # some (or all) are redundant 115180815a77SJung-uk Kim stvewx v0, r10, r9 115280815a77SJung-uk Kim stvewx v0, r11, r9 115380815a77SJung-uk Kim stvewx v0, r12, r9 11547bded2dbSJung-uk Kim 11557bded2dbSJung-uk Kim 11567bded2dbSJung-uk KimLschedule_mangle_done: 11577bded2dbSJung-uk Kim mtlr r7 11587bded2dbSJung-uk Kim # cleanup 11597bded2dbSJung-uk Kim vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0 11607bded2dbSJung-uk Kim vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1 11617bded2dbSJung-uk Kim vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2 11627bded2dbSJung-uk Kim vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3 11637bded2dbSJung-uk Kim vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 11647bded2dbSJung-uk Kim vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5 11657bded2dbSJung-uk Kim vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6 11667bded2dbSJung-uk Kim vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7 11677bded2dbSJung-uk Kim 11687bded2dbSJung-uk Kim blr 11697bded2dbSJung-uk Kim .long 0 11707bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 11717bded2dbSJung-uk Kim 11727bded2dbSJung-uk Kim## 11737bded2dbSJung-uk Kim## .aes_schedule_192_smear 11747bded2dbSJung-uk Kim## 11757bded2dbSJung-uk Kim## Smear the short, low side in the 192-bit key schedule. 
11767bded2dbSJung-uk Kim## 11777bded2dbSJung-uk Kim## Inputs: 11787bded2dbSJung-uk Kim## %xmm7: high side, b a x y 11797bded2dbSJung-uk Kim## %xmm6: low side, d c 0 0 11807bded2dbSJung-uk Kim## %xmm13: 0 11817bded2dbSJung-uk Kim## 11827bded2dbSJung-uk Kim## Outputs: 11837bded2dbSJung-uk Kim## %xmm6: b+c+d b+c 0 0 11847bded2dbSJung-uk Kim## %xmm0: b+c+d b+c b a 11857bded2dbSJung-uk Kim## 11867bded2dbSJung-uk Kim.align 4 11877bded2dbSJung-uk Kim_vpaes_schedule_192_smear: 11887bded2dbSJung-uk Kim ?vspltw v0, v7, 3 11897bded2dbSJung-uk Kim ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 11907bded2dbSJung-uk Kim ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 11917bded2dbSJung-uk Kim vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 11927bded2dbSJung-uk Kim vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 11937bded2dbSJung-uk Kim vmr v0, v6 11947bded2dbSJung-uk Kim ?vsldoi v6, v6, v9, 8 11957bded2dbSJung-uk Kim ?vsldoi v6, v9, v6, 8 # clobber low side with zeros 11967bded2dbSJung-uk Kim blr 11977bded2dbSJung-uk Kim .long 0 11987bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 11997bded2dbSJung-uk Kim 12007bded2dbSJung-uk Kim## 12017bded2dbSJung-uk Kim## .aes_schedule_round 12027bded2dbSJung-uk Kim## 12037bded2dbSJung-uk Kim## Runs one main round of the key schedule on %xmm0, %xmm7 12047bded2dbSJung-uk Kim## 12057bded2dbSJung-uk Kim## Specifically, runs subbytes on the high dword of %xmm0 12067bded2dbSJung-uk Kim## then rotates it by one byte and xors into the low dword of 12077bded2dbSJung-uk Kim## %xmm7. 12087bded2dbSJung-uk Kim## 12097bded2dbSJung-uk Kim## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 12107bded2dbSJung-uk Kim## next rcon. 12117bded2dbSJung-uk Kim## 12127bded2dbSJung-uk Kim## Smears the dwords of %xmm7 by xoring the low into the 12137bded2dbSJung-uk Kim## second low, result into third, result into highest. 
12147bded2dbSJung-uk Kim## 12157bded2dbSJung-uk Kim## Returns results in %xmm7 = %xmm0. 12167bded2dbSJung-uk Kim## Clobbers %xmm1-%xmm4, %r11. 12177bded2dbSJung-uk Kim## 12187bded2dbSJung-uk Kim.align 4 12197bded2dbSJung-uk Kim_vpaes_schedule_round: 12207bded2dbSJung-uk Kim # extract rcon from xmm8 12217bded2dbSJung-uk Kim #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 12227bded2dbSJung-uk Kim ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 12237bded2dbSJung-uk Kim ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 12247bded2dbSJung-uk Kim vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 12257bded2dbSJung-uk Kim 12267bded2dbSJung-uk Kim # rotate 12277bded2dbSJung-uk Kim ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 12287bded2dbSJung-uk Kim ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 12297bded2dbSJung-uk Kim 12307bded2dbSJung-uk Kim # fall through... 12317bded2dbSJung-uk Kim 12327bded2dbSJung-uk Kim # low round: same as high round, but no rotation and no rcon. 
12337bded2dbSJung-uk Kim_vpaes_schedule_low_round: 12347bded2dbSJung-uk Kim # smear xmm7 12357bded2dbSJung-uk Kim ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 12367bded2dbSJung-uk Kim vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 12377bded2dbSJung-uk Kim vspltisb v1, 0x0f # 0x0f..0f 12387bded2dbSJung-uk Kim ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 12397bded2dbSJung-uk Kim 12407bded2dbSJung-uk Kim # subbytes 12417bded2dbSJung-uk Kim vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k 12427bded2dbSJung-uk Kim vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 12437bded2dbSJung-uk Kim vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7 12447bded2dbSJung-uk Kim vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 12457bded2dbSJung-uk Kim vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 12467bded2dbSJung-uk Kim vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 12477bded2dbSJung-uk Kim vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 12487bded2dbSJung-uk Kim vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 12497bded2dbSJung-uk Kim vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7 12507bded2dbSJung-uk Kim vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 12517bded2dbSJung-uk Kim vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 12527bded2dbSJung-uk Kim vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 12537bded2dbSJung-uk Kim vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io 12547bded2dbSJung-uk Kim vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 12557bded2dbSJung-uk Kim vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 12567bded2dbSJung-uk Kim vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 12577bded2dbSJung-uk Kim vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 12587bded2dbSJung-uk Kim 12597bded2dbSJung-uk Kim # add in smeared stuff 12607bded2dbSJung-uk Kim vxor v0, v1, v7 # vpxor %xmm7, %xmm1, 
%xmm0 12617bded2dbSJung-uk Kim vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7 12627bded2dbSJung-uk Kim blr 12637bded2dbSJung-uk Kim .long 0 12647bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 12657bded2dbSJung-uk Kim 12667bded2dbSJung-uk Kim## 12677bded2dbSJung-uk Kim## .aes_schedule_transform 12687bded2dbSJung-uk Kim## 12697bded2dbSJung-uk Kim## Linear-transform %xmm0 according to tables at (%r11) 12707bded2dbSJung-uk Kim## 12717bded2dbSJung-uk Kim## Requires that %xmm9 = 0x0F0F... as in preheat 12727bded2dbSJung-uk Kim## Output in %xmm0 12737bded2dbSJung-uk Kim## Clobbers %xmm2 12747bded2dbSJung-uk Kim## 12757bded2dbSJung-uk Kim.align 4 12767bded2dbSJung-uk Kim_vpaes_schedule_transform: 12777bded2dbSJung-uk Kim #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1 12787bded2dbSJung-uk Kim vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 12797bded2dbSJung-uk Kim # vmovdqa (%r11), %xmm2 # lo 12807bded2dbSJung-uk Kim vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 12817bded2dbSJung-uk Kim # vmovdqa 16(%r11), %xmm1 # hi 12827bded2dbSJung-uk Kim vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0 12837bded2dbSJung-uk Kim vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0 12847bded2dbSJung-uk Kim blr 12857bded2dbSJung-uk Kim .long 0 12867bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 12877bded2dbSJung-uk Kim 12887bded2dbSJung-uk Kim## 12897bded2dbSJung-uk Kim## .aes_schedule_mangle 12907bded2dbSJung-uk Kim## 12917bded2dbSJung-uk Kim## Mangle xmm0 from (basis-transformed) standard version 12927bded2dbSJung-uk Kim## to our version. 
12937bded2dbSJung-uk Kim## 12947bded2dbSJung-uk Kim## On encrypt, 12957bded2dbSJung-uk Kim## xor with 0x63 12967bded2dbSJung-uk Kim## multiply by circulant 0,1,1,1 12977bded2dbSJung-uk Kim## apply shiftrows transform 12987bded2dbSJung-uk Kim## 12997bded2dbSJung-uk Kim## On decrypt, 13007bded2dbSJung-uk Kim## xor with 0x63 13017bded2dbSJung-uk Kim## multiply by "inverse mixcolumns" circulant E,B,D,9 13027bded2dbSJung-uk Kim## deskew 13037bded2dbSJung-uk Kim## apply shiftrows transform 13047bded2dbSJung-uk Kim## 13057bded2dbSJung-uk Kim## 13067bded2dbSJung-uk Kim## Writes out to (%rdx), and increments or decrements it 13077bded2dbSJung-uk Kim## Keeps track of round number mod 4 in %r8 13087bded2dbSJung-uk Kim## Preserves xmm0 13097bded2dbSJung-uk Kim## Clobbers xmm1-xmm5 13107bded2dbSJung-uk Kim## 13117bded2dbSJung-uk Kim.align 4 13127bded2dbSJung-uk Kim_vpaes_schedule_mangle: 13137bded2dbSJung-uk Kim #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later 13147bded2dbSJung-uk Kim # vmovdqa .Lk_mc_forward(%rip),%xmm5 13157bded2dbSJung-uk Kim bne $dir, Lschedule_mangle_dec 13167bded2dbSJung-uk Kim 13177bded2dbSJung-uk Kim # encrypting 13187bded2dbSJung-uk Kim vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4 13197bded2dbSJung-uk Kim addi $out, $out, 16 # add \$16, %rdx 13207bded2dbSJung-uk Kim vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4 13217bded2dbSJung-uk Kim vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1 13227bded2dbSJung-uk Kim vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3 13237bded2dbSJung-uk Kim vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4 13247bded2dbSJung-uk Kim lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 13257bded2dbSJung-uk Kim vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3 13267bded2dbSJung-uk Kim 13277bded2dbSJung-uk Kim vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 13287bded2dbSJung-uk Kim addi r8, r8, -16 # add \$-16, %r8 13297bded2dbSJung-uk Kim andi. 
r8, r8, 0x30 # and \$0x30, %r8 13307bded2dbSJung-uk Kim 13317bded2dbSJung-uk Kim #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 13327bded2dbSJung-uk Kim vperm v1, v3, v3, $outperm # rotate right/left 13337bded2dbSJung-uk Kim vsel v2, $outhead, v1, $outmask 13347bded2dbSJung-uk Kim vmr $outhead, v1 13357bded2dbSJung-uk Kim stvx v2, 0, $out 13367bded2dbSJung-uk Kim blr 13377bded2dbSJung-uk Kim 13387bded2dbSJung-uk Kim.align 4 13397bded2dbSJung-uk KimLschedule_mangle_dec: 13407bded2dbSJung-uk Kim # inverse mix columns 13417bded2dbSJung-uk Kim # lea .Lk_dksd(%rip),%r11 13427bded2dbSJung-uk Kim vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi 13437bded2dbSJung-uk Kim #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo 13447bded2dbSJung-uk Kim 13457bded2dbSJung-uk Kim # vmovdqa 0x00(%r11), %xmm2 13467bded2dbSJung-uk Kim vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2 13477bded2dbSJung-uk Kim # vmovdqa 0x10(%r11), %xmm3 13487bded2dbSJung-uk Kim vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3 13497bded2dbSJung-uk Kim vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 13507bded2dbSJung-uk Kim vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 13517bded2dbSJung-uk Kim 13527bded2dbSJung-uk Kim # vmovdqa 0x20(%r11), %xmm2 13537bded2dbSJung-uk Kim vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2 13547bded2dbSJung-uk Kim vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 13557bded2dbSJung-uk Kim # vmovdqa 0x30(%r11), %xmm3 13567bded2dbSJung-uk Kim vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3 13577bded2dbSJung-uk Kim vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 13587bded2dbSJung-uk Kim vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 13597bded2dbSJung-uk Kim 13607bded2dbSJung-uk Kim # vmovdqa 0x40(%r11), %xmm2 13617bded2dbSJung-uk Kim vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2 13627bded2dbSJung-uk Kim vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 13637bded2dbSJung-uk Kim # vmovdqa 0x50(%r11), %xmm3 13647bded2dbSJung-uk Kim vperm v3, v21, v21, v1 # vpshufb %xmm1, 
%xmm3, %xmm3 13657bded2dbSJung-uk Kim vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 13667bded2dbSJung-uk Kim 13677bded2dbSJung-uk Kim # vmovdqa 0x60(%r11), %xmm2 13687bded2dbSJung-uk Kim vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2 13697bded2dbSJung-uk Kim vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 13707bded2dbSJung-uk Kim # vmovdqa 0x70(%r11), %xmm4 13717bded2dbSJung-uk Kim vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4 13727bded2dbSJung-uk Kim lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 13737bded2dbSJung-uk Kim vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 13747bded2dbSJung-uk Kim vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3 13757bded2dbSJung-uk Kim 13767bded2dbSJung-uk Kim addi $out, $out, -16 # add \$-16, %rdx 13777bded2dbSJung-uk Kim 13787bded2dbSJung-uk Kim vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 13797bded2dbSJung-uk Kim addi r8, r8, -16 # add \$-16, %r8 13807bded2dbSJung-uk Kim andi. r8, r8, 0x30 # and \$0x30, %r8 13817bded2dbSJung-uk Kim 13827bded2dbSJung-uk Kim #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 13837bded2dbSJung-uk Kim vperm v1, v3, v3, $outperm # rotate right/left 13847bded2dbSJung-uk Kim vsel v2, $outhead, v1, $outmask 13857bded2dbSJung-uk Kim vmr $outhead, v1 13867bded2dbSJung-uk Kim stvx v2, 0, $out 13877bded2dbSJung-uk Kim blr 13887bded2dbSJung-uk Kim .long 0 13897bded2dbSJung-uk Kim .byte 0,12,0x14,0,0,0,0,0 13907bded2dbSJung-uk Kim 13917bded2dbSJung-uk Kim.globl .vpaes_set_encrypt_key 13927bded2dbSJung-uk Kim.align 5 13937bded2dbSJung-uk Kim.vpaes_set_encrypt_key: 13947bded2dbSJung-uk Kim $STU $sp,-$FRAME($sp) 13957bded2dbSJung-uk Kim li r10,`15+6*$SIZE_T` 13967bded2dbSJung-uk Kim li r11,`31+6*$SIZE_T` 13977bded2dbSJung-uk Kim mflr r0 13987bded2dbSJung-uk Kim mfspr r6, 256 # save vrsave 13997bded2dbSJung-uk Kim stvx v20,r10,$sp 14007bded2dbSJung-uk Kim addi r10,r10,32 14017bded2dbSJung-uk Kim stvx v21,r11,$sp 14027bded2dbSJung-uk Kim addi r11,r11,32 14037bded2dbSJung-uk Kim stvx v22,r10,$sp 14047bded2dbSJung-uk 
Kim addi r10,r10,32 14057bded2dbSJung-uk Kim stvx v23,r11,$sp 14067bded2dbSJung-uk Kim addi r11,r11,32 14077bded2dbSJung-uk Kim stvx v24,r10,$sp 14087bded2dbSJung-uk Kim addi r10,r10,32 14097bded2dbSJung-uk Kim stvx v25,r11,$sp 14107bded2dbSJung-uk Kim addi r11,r11,32 14117bded2dbSJung-uk Kim stvx v26,r10,$sp 14127bded2dbSJung-uk Kim addi r10,r10,32 14137bded2dbSJung-uk Kim stvx v27,r11,$sp 14147bded2dbSJung-uk Kim addi r11,r11,32 14157bded2dbSJung-uk Kim stvx v28,r10,$sp 14167bded2dbSJung-uk Kim addi r10,r10,32 14177bded2dbSJung-uk Kim stvx v29,r11,$sp 14187bded2dbSJung-uk Kim addi r11,r11,32 14197bded2dbSJung-uk Kim stvx v30,r10,$sp 14207bded2dbSJung-uk Kim stvx v31,r11,$sp 14217bded2dbSJung-uk Kim stw r6,`$FRAME-4`($sp) # save vrsave 14227bded2dbSJung-uk Kim li r7, -1 14237bded2dbSJung-uk Kim $PUSH r0, `$FRAME+$LRSAVE`($sp) 14247bded2dbSJung-uk Kim mtspr 256, r7 # preserve all AltiVec registers 14257bded2dbSJung-uk Kim 14267bded2dbSJung-uk Kim srwi r9, $bits, 5 # shr \$5,%eax 14277bded2dbSJung-uk Kim addi r9, r9, 6 # add \$5,%eax 14287bded2dbSJung-uk Kim stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 14297bded2dbSJung-uk Kim 14307bded2dbSJung-uk Kim cmplw $dir, $bits, $bits # set encrypt direction 14317bded2dbSJung-uk Kim li r8, 0x30 # mov \$0x30,%r8d 14327bded2dbSJung-uk Kim bl _vpaes_schedule_core 14337bded2dbSJung-uk Kim 14347bded2dbSJung-uk Kim $POP r0, `$FRAME+$LRSAVE`($sp) 14357bded2dbSJung-uk Kim li r10,`15+6*$SIZE_T` 14367bded2dbSJung-uk Kim li r11,`31+6*$SIZE_T` 14377bded2dbSJung-uk Kim mtspr 256, r6 # restore vrsave 14387bded2dbSJung-uk Kim mtlr r0 14397bded2dbSJung-uk Kim xor r3, r3, r3 14407bded2dbSJung-uk Kim lvx v20,r10,$sp 14417bded2dbSJung-uk Kim addi r10,r10,32 14427bded2dbSJung-uk Kim lvx v21,r11,$sp 14437bded2dbSJung-uk Kim addi r11,r11,32 14447bded2dbSJung-uk Kim lvx v22,r10,$sp 14457bded2dbSJung-uk Kim addi r10,r10,32 14467bded2dbSJung-uk Kim lvx v23,r11,$sp 14477bded2dbSJung-uk Kim addi r11,r11,32 
14487bded2dbSJung-uk Kim lvx v24,r10,$sp 14497bded2dbSJung-uk Kim addi r10,r10,32 14507bded2dbSJung-uk Kim lvx v25,r11,$sp 14517bded2dbSJung-uk Kim addi r11,r11,32 14527bded2dbSJung-uk Kim lvx v26,r10,$sp 14537bded2dbSJung-uk Kim addi r10,r10,32 14547bded2dbSJung-uk Kim lvx v27,r11,$sp 14557bded2dbSJung-uk Kim addi r11,r11,32 14567bded2dbSJung-uk Kim lvx v28,r10,$sp 14577bded2dbSJung-uk Kim addi r10,r10,32 14587bded2dbSJung-uk Kim lvx v29,r11,$sp 14597bded2dbSJung-uk Kim addi r11,r11,32 14607bded2dbSJung-uk Kim lvx v30,r10,$sp 14617bded2dbSJung-uk Kim lvx v31,r11,$sp 14627bded2dbSJung-uk Kim addi $sp,$sp,$FRAME 14637bded2dbSJung-uk Kim blr 14647bded2dbSJung-uk Kim .long 0 14657bded2dbSJung-uk Kim .byte 0,12,0x04,1,0x80,0,3,0 14667bded2dbSJung-uk Kim .long 0 14677bded2dbSJung-uk Kim.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key 14687bded2dbSJung-uk Kim 14697bded2dbSJung-uk Kim.globl .vpaes_set_decrypt_key 14707bded2dbSJung-uk Kim.align 4 14717bded2dbSJung-uk Kim.vpaes_set_decrypt_key: 14727bded2dbSJung-uk Kim $STU $sp,-$FRAME($sp) 14737bded2dbSJung-uk Kim li r10,`15+6*$SIZE_T` 14747bded2dbSJung-uk Kim li r11,`31+6*$SIZE_T` 14757bded2dbSJung-uk Kim mflr r0 14767bded2dbSJung-uk Kim mfspr r6, 256 # save vrsave 14777bded2dbSJung-uk Kim stvx v20,r10,$sp 14787bded2dbSJung-uk Kim addi r10,r10,32 14797bded2dbSJung-uk Kim stvx v21,r11,$sp 14807bded2dbSJung-uk Kim addi r11,r11,32 14817bded2dbSJung-uk Kim stvx v22,r10,$sp 14827bded2dbSJung-uk Kim addi r10,r10,32 14837bded2dbSJung-uk Kim stvx v23,r11,$sp 14847bded2dbSJung-uk Kim addi r11,r11,32 14857bded2dbSJung-uk Kim stvx v24,r10,$sp 14867bded2dbSJung-uk Kim addi r10,r10,32 14877bded2dbSJung-uk Kim stvx v25,r11,$sp 14887bded2dbSJung-uk Kim addi r11,r11,32 14897bded2dbSJung-uk Kim stvx v26,r10,$sp 14907bded2dbSJung-uk Kim addi r10,r10,32 14917bded2dbSJung-uk Kim stvx v27,r11,$sp 14927bded2dbSJung-uk Kim addi r11,r11,32 14937bded2dbSJung-uk Kim stvx v28,r10,$sp 14947bded2dbSJung-uk Kim addi r10,r10,32 
14957bded2dbSJung-uk Kim stvx v29,r11,$sp 14967bded2dbSJung-uk Kim addi r11,r11,32 14977bded2dbSJung-uk Kim stvx v30,r10,$sp 14987bded2dbSJung-uk Kim stvx v31,r11,$sp 14997bded2dbSJung-uk Kim stw r6,`$FRAME-4`($sp) # save vrsave 15007bded2dbSJung-uk Kim li r7, -1 15017bded2dbSJung-uk Kim $PUSH r0, `$FRAME+$LRSAVE`($sp) 15027bded2dbSJung-uk Kim mtspr 256, r7 # preserve all AltiVec registers 15037bded2dbSJung-uk Kim 15047bded2dbSJung-uk Kim srwi r9, $bits, 5 # shr \$5,%eax 15057bded2dbSJung-uk Kim addi r9, r9, 6 # add \$5,%eax 15067bded2dbSJung-uk Kim stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 15077bded2dbSJung-uk Kim 15087bded2dbSJung-uk Kim slwi r9, r9, 4 # shl \$4,%eax 15097bded2dbSJung-uk Kim add $out, $out, r9 # lea (%rdx,%rax),%rdx 15107bded2dbSJung-uk Kim 15117bded2dbSJung-uk Kim cmplwi $dir, $bits, 0 # set decrypt direction 15127bded2dbSJung-uk Kim srwi r8, $bits, 1 # shr \$1,%r8d 15137bded2dbSJung-uk Kim andi. r8, r8, 32 # and \$32,%r8d 15147bded2dbSJung-uk Kim xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 15157bded2dbSJung-uk Kim bl _vpaes_schedule_core 15167bded2dbSJung-uk Kim 15177bded2dbSJung-uk Kim $POP r0, `$FRAME+$LRSAVE`($sp) 15187bded2dbSJung-uk Kim li r10,`15+6*$SIZE_T` 15197bded2dbSJung-uk Kim li r11,`31+6*$SIZE_T` 15207bded2dbSJung-uk Kim mtspr 256, r6 # restore vrsave 15217bded2dbSJung-uk Kim mtlr r0 15227bded2dbSJung-uk Kim xor r3, r3, r3 15237bded2dbSJung-uk Kim lvx v20,r10,$sp 15247bded2dbSJung-uk Kim addi r10,r10,32 15257bded2dbSJung-uk Kim lvx v21,r11,$sp 15267bded2dbSJung-uk Kim addi r11,r11,32 15277bded2dbSJung-uk Kim lvx v22,r10,$sp 15287bded2dbSJung-uk Kim addi r10,r10,32 15297bded2dbSJung-uk Kim lvx v23,r11,$sp 15307bded2dbSJung-uk Kim addi r11,r11,32 15317bded2dbSJung-uk Kim lvx v24,r10,$sp 15327bded2dbSJung-uk Kim addi r10,r10,32 15337bded2dbSJung-uk Kim lvx v25,r11,$sp 15347bded2dbSJung-uk Kim addi r11,r11,32 15357bded2dbSJung-uk Kim lvx v26,r10,$sp 15367bded2dbSJung-uk Kim addi r10,r10,32 
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___
}

# Post-process the generated assembly held in $code before piping it to
# ppc-xlate.pl: evaluate `...` arithmetic, emit the constants table in an
# endian-correct byte form, and fix up '?'-prefixed endian-sensitive
# instructions for little-endian flavours.
my $consts=1;
foreach  (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;	# evaluate backticked Perl expressions in-line

	# constants table endian-specific conversion: a ".long a, b, c, d ?tag"
	# line is rewritten as raw .byte data so the table is byte-order exact
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    # (inner foreach deliberately reuses $_, shadowing the outer line)
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;	# leading 0 => octal/hex literal, use oct(); else decimal
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion, chosen by the ?tag on the .long line:
	    #   ?inv - complement low nibbles (permute-index tables)
	    #   ?rev - reverse the byte order of the whole vector
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    # the 'or' chain applies at most one rewrite per line:
	    # swap lvsr/lvsl, swap vperm's two source registers, and
	    # mirror vsldoi/vspltw shift/lane counts for LE lane order
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;		# just strip the '?' marker
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";