#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to when called from
# EVP only). "Drop-in" implies that this module doesn't share the key
# schedule structure with the original, nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86_64.pl column -
# [also large-block CBC] encrypt/decrypt.
#
#		aes-x86_64.pl		vpaes-x86_64.pl
#
# Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
# Nehalem	29.6/40.3/14.6		10.0/11.8
# Atom		57.3/74.2/32.1		60.9/77.2(***)
# Silvermont	52.7/64.0/19.5		48.8/60.8(***)
# Goldmont	38.9/49.0/17.8		10.6/12.6
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than specifically to Intel HTT. As the vast
#	majority of contemporary cores share cache, the slower code path
#	is commonplace. In other words, "with-hyper-threading-off"
#	results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to the initial 65nm design, a.k.a. Conroe.
#
# (***)	The less impressive improvement on Core 2 and Atom is due to
#	slow pshufb, yet it's a respectable +36%/62% improvement on
#	Core 2 (as implied, over the "hyper-threading-safe" code path).
#
#						<appro@openssl.org>
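#
# For reference, the exported entry points below are meant to mirror the
# aes-x86_64.pl calling convention. A minimal C-level sketch of the
# interface (illustrative only; AES_KEY is the usual OpenSSL structure
# and the authoritative prototypes live with the C callers):
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                       size_t length, const AES_KEY *key,
#	                       unsigned char *ivec, int enc);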

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="vpaes";

$code.=<<___;
.text

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
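##  Note on the technique (illustrative only, names made up for
##  illustration): every table lookup in this file is a 16-entry,
##  byte-wide pshufb lookup, which is what keeps the code constant-time -
##  the whole table is read regardless of the index.  The recurring
##  pattern of splitting each byte into nibbles and combining a "lo" and
##  a "hi" table, as in the input transform right below, corresponds per
##  byte to roughly this C:
##
##	uint8_t combine(const uint8_t lo_tab[16], const uint8_t hi_tab[16],
##	                uint8_t x)
##	{
##		return lo_tab[x & 0x0F] ^ hi_tab[x >> 4];
##	}
##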
.type	_vpaes_encrypt_core,\@abi-omnipotent
.align 16
_vpaes_encrypt_core:
	mov	%rdx,	%r9
	mov	\$16,	%r11
	mov	240(%rdx),%eax
	movdqa	%xmm9,	%xmm1
	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	pandn	%xmm0,	%xmm1
	movdqu	(%r9),	%xmm5		# round0 key
	psrld	\$4,	%xmm1
	pand	%xmm9,	%xmm0
	pshufb	%xmm0,	%xmm2
	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
	pshufb	%xmm1,	%xmm0
	pxor	%xmm5,	%xmm2
	add	\$16,	%r9
	pxor	%xmm2,	%xmm0
	lea	.Lk_mc_backward(%rip),%r10
	jmp	.Lenc_entry

.align 16
.Lenc_loop:
	# middle of middle round
	movdqa  %xmm13,	%xmm4	# 4 : sb1u
	movdqa  %xmm12,	%xmm0	# 0 : sb1t
	pshufb  %xmm2,	%xmm4	# 4 = sb1u
	pshufb  %xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
	movdqa  %xmm15,	%xmm5	# 4 : sb2u
	pxor	%xmm4,	%xmm0	# 0 = A
	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
	pshufb	%xmm2,	%xmm5	# 4 = sb2u
	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
	movdqa	%xmm14, %xmm2	# 2 : sb2t
	pshufb	%xmm3,  %xmm2	# 2 = sb2t
	movdqa	%xmm0,  %xmm3	# 3 = A
	pxor	%xmm5,	%xmm2	# 2 = 2A
	pshufb  %xmm1,  %xmm0	# 0 = B
	add	\$16,	%r9	# next key
	pxor	%xmm2,  %xmm0	# 0 = 2A+B
	pshufb	%xmm4,	%xmm3	# 3 = D
	add	\$16,	%r11	# next mc
	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
	and	\$0x30,	%r11	# ... mod 4
	sub	\$1,%rax	# nr--
	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D

.Lenc_entry:
	# top of round
	movdqa  %xmm9, 	%xmm1	# 1 : i
	movdqa	%xmm11, %xmm5	# 2 : a/k
	pandn	%xmm0, 	%xmm1	# 1 = i<<4
	psrld	\$4,   	%xmm1   # 1 = i
	pand	%xmm9, 	%xmm0   # 0 = k
	pshufb  %xmm0,  %xmm5	# 2 = a/k
	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
	movdqa	%xmm10, %xmm3   # 3 : 1/jak
	pxor	%xmm0, 	%xmm2  	# 2 = io
	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
	movdqu	(%r9),	%xmm5
	pxor	%xmm1,  %xmm3   # 3 = jo
	jnz	.Lenc_loop

	# middle of last round
	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	pshufb  %xmm2,  %xmm4	# 4 = sbou
	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
	pshufb  %xmm3,	%xmm0	# 0 = sb1t
	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
	pxor	%xmm4,	%xmm0	# 0 = A
	pshufb	%xmm1,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

##
##  Decryption core
##
##  Same API as encryption core.
##
.type	_vpaes_decrypt_core,\@abi-omnipotent
.align	16
_vpaes_decrypt_core:
	mov	%rdx,	%r9		# load key
	mov	240(%rdx),%eax
	movdqa	%xmm9,	%xmm1
	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	pandn	%xmm0,	%xmm1
	mov	%rax,	%r11
	psrld	\$4,	%xmm1
	movdqu	(%r9),	%xmm5		# round0 key
	shl	\$4,	%r11
	pand	%xmm9,	%xmm0
	pshufb	%xmm0,	%xmm2
	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
	xor	\$0x30,	%r11
	lea	.Lk_dsbd(%rip),%r10
	pshufb	%xmm1,	%xmm0
	and	\$0x30,	%r11
	pxor	%xmm5,	%xmm2
	movdqa	.Lk_mc_forward+48(%rip), %xmm5
	pxor	%xmm2,	%xmm0
	add	\$16,	%r9
	add	%r10,	%r11
	jmp	.Ldec_entry

.align 16
.Ldec_loop:
##
##  Inverse mix columns
##
	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
	movdqa  -0x10(%r10),%xmm1	# 0 : sb9t
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	pshufb	%xmm3,	%xmm1		# 0 = sb9t
	pxor	%xmm4,	%xmm0
	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x10(%r10),%xmm1	# 0 : sbdt

	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x30(%r10),%xmm1	# 0 : sbbt

	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbbt
	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x50(%r10),%xmm1	# 0 : sbet

	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 4 = ch
	add	\$16, %r9		# next round key
	palignr	\$12,	%xmm5,	%xmm5
	pxor	%xmm1,	%xmm0		# 0 = ch
	sub	\$1,%rax		# nr--

.Ldec_entry:
	# top of round
	movdqa  %xmm9, 	%xmm1	# 1 : i
	pandn	%xmm0, 	%xmm1	# 1 = i<<4
	movdqa	%xmm11, %xmm2	# 2 : a/k
	psrld	\$4,    %xmm1	# 1 = i
	pand	%xmm9, 	%xmm0	# 0 = k
	pshufb  %xmm0,  %xmm2	# 2 = a/k
	movdqa	%xmm10,	%xmm3	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
	movdqa	%xmm10,	%xmm4	# 4 : 1/j
	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
	movdqa	%xmm10, %xmm3	# 3 : 1/jak
	pxor	%xmm0, 	%xmm2	# 2 = io
	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
	movdqu	(%r9),	%xmm0
	pxor	%xmm1,  %xmm3	# 3 = jo
	jnz	.Ldec_loop

	# middle of last round
	movdqa	0x60(%r10), %xmm4	# 3 : sbou
	pshufb  %xmm2,  %xmm4	# 4 = sbou
	pxor	%xmm0,  %xmm4	# 4 = sb1u + k
	movdqa	0x70(%r10), %xmm0	# 0 : sbot
	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	pshufb  %xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm4,	%xmm0	# 0 = A
	pshufb	%xmm2,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_schedule_core,\@abi-omnipotent
.align	16
_vpaes_schedule_core:
	# rdi = key
	# rsi = size in bits
	# rdx = buffer
	# rcx = direction.  0=encrypt, 1=decrypt

	call	_vpaes_preheat		# load the tables
	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
	movdqu	(%rdi),	%xmm0		# load key (unaligned)

	# input transform
	movdqa	%xmm0,	%xmm3
	lea	.Lk_ipt(%rip), %r11
	call	_vpaes_schedule_transform
	movdqa	%xmm0,	%xmm7

	lea	.Lk_sr(%rip),%r10
	test	%rcx,	%rcx
	jnz	.Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	movdqu	%xmm0,	(%rdx)
	jmp	.Lschedule_go

.Lschedule_am_decrypting:
	# decrypting, output zeroth round key after shiftrows
	movdqa	(%r8,%r10),%xmm1
	pshufb  %xmm1,	%xmm3
	movdqu	%xmm3,	(%rdx)
	xor	\$0x30, %r8

.Lschedule_go:
	cmp	\$192,	%esi
	ja	.Lschedule_256
	je	.Lschedule_192
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
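##  Key count, for reference: the zeroth round key was written before
##  .Lschedule_go, the loop below iterates 10 times, and its final
##  iteration branches to .Lschedule_mangle_last instead of
##  _vpaes_schedule_mangle, so 1 + 9 + 1 = 11 round keys are produced,
##  as AES-128 requires.
##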
.Lschedule_128:
	mov	\$10, %esi

.Loop_schedule_128:
	call 	_vpaes_schedule_round
	dec	%rsi
	jz 	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# write output
	jmp 	.Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
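##  Key count, for reference: AES-192 needs 13 round keys.  The zeroth
##  was written before .Lschedule_go; the first 3 cycles below emit
##  3 keys each via _vpaes_schedule_mangle, and the last cycle emits 2
##  before branching to .Lschedule_mangle_last for the final one, so
##  1 + 3*3 + 2 + 1 = 13.
##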
.align	16
.Lschedule_192:
	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	call	_vpaes_schedule_transform	# input transform
	movdqa	%xmm0,	%xmm6		# save short part
	pxor	%xmm4,	%xmm4		# clear 4
	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
	mov	\$4,	%esi

.Loop_schedule_192:
	call	_vpaes_schedule_round
	palignr	\$8,%xmm6,%xmm0
	call	_vpaes_schedule_mangle	# save key n
	call	_vpaes_schedule_192_smear
	call	_vpaes_schedule_mangle	# save key n+1
	call	_vpaes_schedule_round
	dec	%rsi
	jz 	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# save key n+2
	call	_vpaes_schedule_192_smear
	jmp	.Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align	16
.Lschedule_256:
	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	call	_vpaes_schedule_transform	# input transform
	mov	\$7, %esi

.Loop_schedule_256:
	call	_vpaes_schedule_mangle	# output low result
	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	# high round
	call	_vpaes_schedule_round
	dec	%rsi
	jz 	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	pshufd	\$0xFF,	%xmm0,	%xmm0
	movdqa	%xmm7,	%xmm5
	movdqa	%xmm6,	%xmm7
	call	_vpaes_schedule_low_round
	movdqa	%xmm5,	%xmm7

	jmp	.Loop_schedule_256


##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	16
.Lschedule_mangle_last:
	# schedule last round key from xmm0
	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	test	%rcx, 	%rcx
	jnz	.Lschedule_mangle_last_dec

	# encrypting
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,	%xmm0		# output permute
	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
	add	\$32,	%rdx

.Lschedule_mangle_last_dec:
	add	\$-16,	%rdx
	pxor	.Lk_s63(%rip),	%xmm0
	call	_vpaes_schedule_transform # output transform
	movdqu	%xmm0,	(%rdx)		# save last key

	# cleanup
	pxor	%xmm0,  %xmm0
	pxor	%xmm1,  %xmm1
	pxor	%xmm2,  %xmm2
	pxor	%xmm3,  %xmm3
	pxor	%xmm4,  %xmm4
	pxor	%xmm5,  %xmm5
	pxor	%xmm6,  %xmm6
	pxor	%xmm7,  %xmm7
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
.type	_vpaes_schedule_192_smear,\@abi-omnipotent
.align	16
_vpaes_schedule_192_smear:
	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
	pxor	%xmm1,	%xmm1
	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
	movdqa	%xmm6,	%xmm0
	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
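##  For orientation, the conventional FIPS-197 expansion step that this
##  routine mirrors (ignoring the change of basis used throughout this
##  module) looks roughly like the C below, where prev[]/next[] are the
##  previous/new round key as 4 words and rcon is the round constant
##  (sketch only, not part of the module):
##
##	uint32_t t = SubWord(RotWord(prev[3])) ^ rcon;
##	next[0] = prev[0] ^ t;		/* subbytes+rotate+rcon, then smear */
##	next[1] = prev[1] ^ next[0];
##	next[2] = prev[2] ^ next[1];
##	next[3] = prev[3] ^ next[2];
##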
.type	_vpaes_schedule_round,\@abi-omnipotent
.align	16
_vpaes_schedule_round:
	# extract rcon from xmm8
	pxor	%xmm1,	%xmm1
	palignr	\$15,	%xmm8,	%xmm1
	palignr	\$15,	%xmm8,	%xmm8
	pxor	%xmm1,	%xmm7

	# rotate
	pshufd	\$0xFF,	%xmm0,	%xmm0
	palignr	\$1,	%xmm0,	%xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$4,	%xmm7
	pxor	%xmm1,	%xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$8,	%xmm7
	pxor	%xmm1,	%xmm7
	pxor	.Lk_s63(%rip), %xmm7

	# subbytes
	movdqa  %xmm9, 	%xmm1
	pandn	%xmm0, 	%xmm1
	psrld	\$4,    %xmm1		# 1 = i
	pand	%xmm9, 	%xmm0		# 0 = k
	movdqa	%xmm11, %xmm2		# 2 : a/k
	pshufb  %xmm0,  %xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb  %xmm1, 	%xmm3		# 3 = 1/i
	pxor	%xmm2, 	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0, 	%xmm4		# 4 = 1/j
	pxor	%xmm2, 	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb  %xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0, 	%xmm2		# 2 = io
	movdqa	%xmm10, %xmm3		# 3 : 1/jak
	pshufb  %xmm4,  %xmm3		# 3 = 1/jak
	pxor	%xmm1,  %xmm3		# 3 = jo
	movdqa	%xmm13, %xmm4		# 4 : sbou
	pshufb  %xmm2,  %xmm4		# 4 = sbou
	movdqa	%xmm12, %xmm0		# 0 : sbot
	pshufb  %xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4, 	%xmm0		# 0 = sbox output

	# add in smeared stuff
	pxor	%xmm7,	%xmm0
	movdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
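##  The callers point %r11 at .Lk_ipt (input transform), .Lk_opt (output
##  transform) or .Lk_deskew, so this one helper covers all three; per
##  byte it computes hi_table[x >> 4] ^ lo_table[x & 0x0F], the same
##  lo/hi nibble split used throughout this file.
##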
.type	_vpaes_schedule_transform,\@abi-omnipotent
.align	16
_vpaes_schedule_transform:
	movdqa	%xmm9,	%xmm1
	pandn	%xmm0,	%xmm1
	psrld	\$4,	%xmm1
	pand	%xmm9,	%xmm0
	movdqa	(%r11), %xmm2 	# lo
	pshufb	%xmm0,	%xmm2
	movdqa	16(%r11), %xmm0 # hi
	pshufb	%xmm1,	%xmm0
	pxor	%xmm2,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
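##  (For reference: the pshufb/pxor ladder in the encrypting path below
##  implements the "multiply by circulant 0,1,1,1" step - each pshufb
##  with .Lk_mc_forward rotates the bytes within each column and the
##  running pxor accumulates the three rotated copies.)
##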
.type	_vpaes_schedule_mangle,\@abi-omnipotent
.align	16
_vpaes_schedule_mangle:
	movdqa	%xmm0,	%xmm4	# save xmm0 for later
	movdqa	.Lk_mc_forward(%rip),%xmm5
	test	%rcx, 	%rcx
	jnz	.Lschedule_mangle_dec

	# encrypting
	add	\$16,	%rdx
	pxor	.Lk_s63(%rip),%xmm4
	pshufb	%xmm5,	%xmm4
	movdqa	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3

	jmp	.Lschedule_mangle_both
.align	16
.Lschedule_mangle_dec:
	# inverse mix columns
	lea	.Lk_dksd(%rip),%r11
	movdqa	%xmm9,	%xmm1
	pandn	%xmm4,	%xmm1
	psrld	\$4,	%xmm1	# 1 = hi
	pand	%xmm9,	%xmm4	# 4 = lo

	movdqa	0x00(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	movdqa	0x10(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x20(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x30(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x40(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x50(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x60(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x70(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3

	add	\$-16,	%rdx

.Lschedule_mangle_both:
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,%xmm3
	add	\$-16,	%r8
	and	\$0x30,	%r8
	movdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

#
# Interface to OpenSSL
#
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@function,3
.align	16
${PREFIX}_set_encrypt_key:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_key_body:
___
$code.=<<___;
	mov	%esi,%eax
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
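	# (For reference: 128/192/256-bit keys give 9/11/13 here, one less
	# than the conventional 10/12/14.  The cores use this as the count
	# of "middle" rounds; the last round is handled separately and the
	# round-0 key is folded into the input transform.)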

	mov	\$0,%ecx
	mov	\$0x30,%r8d
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key

.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@function,3
.align	16
${PREFIX}_set_decrypt_key:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_key_body:
___
$code.=<<___;
	mov	%esi,%eax
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	shl	\$4,%eax
	lea	16(%rdx,%rax),%rdx

	mov	\$1,%ecx
	mov	%esi,%r8d
	shr	\$1,%r8d
	and	\$32,%r8d
	xor	\$32,%r8d	# nbits==192?0:32
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax
	ret
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key

.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@function,3
.align	16
${PREFIX}_encrypt:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0
	call	_vpaes_preheat
	call	_vpaes_encrypt_core
	movdqu	%xmm0,(%rsi)
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_epilogue:
___
$code.=<<___;
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@function,3
.align	16
${PREFIX}_decrypt:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0
	call	_vpaes_preheat
	call	_vpaes_decrypt_core
	movdqu	%xmm0,(%rsi)
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_epilogue:
___
$code.=<<___;
	ret
.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
___
{
my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp,const int enc);
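#
# A minimal caller-side sketch in C (illustrative only, buffer names are
# placeholders): length is expected to be a multiple of 16, since partial
# vectors are not handled (see note at the top of the file), and the IV
# buffer is updated in place so chained calls keep working.
#
#	AES_KEY ks;
#	unsigned char iv[16] = {0};	/* caller-supplied IV */
#	vpaes_set_encrypt_key(user_key, 128, &ks);
#	vpaes_cbc_encrypt(in, out, len /* multiple of 16 */, &ks, iv,
#	                  1 /* non-zero = encrypt */);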
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	xchg	$key,$len
___
($len,$key)=($key,$len);
$code.=<<___;
	sub	\$16,$len
	jc	.Lcbc_abort
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_body:
___
$code.=<<___;
	movdqu	($ivp),%xmm6		# load IV
	sub	$inp,$out
	call	_vpaes_preheat
	cmp	\$0,${enc}d
	je	.Lcbc_dec_loop
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movdqu	($inp),%xmm0
	pxor	%xmm6,%xmm0
	call	_vpaes_encrypt_core
	movdqa	%xmm0,%xmm6
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_loop:
	movdqu	($inp),%xmm0
	movdqa	%xmm0,%xmm7
	call	_vpaes_decrypt_core
	pxor	%xmm6,%xmm0
	movdqa	%xmm7,%xmm6
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_dec_loop
.Lcbc_done:
	movdqu	%xmm6,($ivp)		# save IV
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lcbc_epilogue:
___
$code.=<<___;
.Lcbc_abort:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_preheat,\@abi-omnipotent
.align	16
_vpaes_preheat:
	lea	.Lk_s0F(%rip), %r10
	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
	ret
.size	_vpaes_preheat,.-_vpaes_preheat
########################################################
##                                                    ##
##                     Constants                      ##
##                                                    ##
########################################################
.type	_vpaes_consts,\@object
.align	64
_vpaes_consts:
.Lk_inv:	# inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809

.Lk_s0F:	# s0F
	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F

.Lk_ipt:	# input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sb1:	# sb1u, sb1t
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:	# sb2u, sb2t
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:	# sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

.Lk_mc_forward:	# mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:# mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F

.Lk_sr:		# sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

.Lk_rcon:	# rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_s63:	# s63: all equal to 0x63 transformed
	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

.Lk_opt:	# output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

##
##  Decryption stuff
##  Key schedule constants
##
.Lk_dksd:	# decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	# decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	# decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

##
##  Decryption stuff
##  Round function constants
##
.Lk_dipt:	# decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194

.Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:	# decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.align	64
.size	_vpaes_consts,.-_vpaes_consts
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xb8(%rax),%rax		# adjust stack pointer

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
	.rva	.LSEH_info_${PREFIX}_set_encrypt_key

	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
	.rva	.LSEH_info_${PREFIX}_set_decrypt_key

	.rva	.LSEH_begin_${PREFIX}_encrypt
	.rva	.LSEH_end_${PREFIX}_encrypt
	.rva	.LSEH_info_${PREFIX}_encrypt

	.rva	.LSEH_begin_${PREFIX}_decrypt
	.rva	.LSEH_end_${PREFIX}_decrypt
	.rva	.LSEH_info_${PREFIX}_decrypt

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_${PREFIX}_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_${PREFIX}_set_encrypt_key:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_set_decrypt_key:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;