1*1dcdf01fSchristos#! /usr/bin/env perl
2*1dcdf01fSchristos# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
3*1dcdf01fSchristos#
4*1dcdf01fSchristos# Licensed under the OpenSSL license (the "License").  You may not use
5*1dcdf01fSchristos# this file except in compliance with the License.  You can obtain a copy
6*1dcdf01fSchristos# in the file LICENSE in the source distribution or at
7*1dcdf01fSchristos# https://www.openssl.org/source/license.html
8*1dcdf01fSchristos
9*1dcdf01fSchristos
10*1dcdf01fSchristos# ====================================================================
11*1dcdf01fSchristos# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12*1dcdf01fSchristos# project. The module is, however, dual licensed under OpenSSL and
13*1dcdf01fSchristos# CRYPTOGAMS licenses depending on where you obtain it. For further
14*1dcdf01fSchristos# details see http://www.openssl.org/~appro/cryptogams/.
15*1dcdf01fSchristos# ====================================================================
16*1dcdf01fSchristos
17*1dcdf01fSchristos# September 2011
18*1dcdf01fSchristos#
19*1dcdf01fSchristos# Assembler helpers for Padlock engine. Compared to original engine
20*1dcdf01fSchristos# version relying on inline assembler and compiled with gcc 3.4.6 it
21*1dcdf01fSchristos# was measured to provide ~100% improvement on misaligned data in ECB
22*1dcdf01fSchristos# mode and ~75% in CBC mode. For aligned data improvement can be
23*1dcdf01fSchristos# observed for short inputs only, e.g. 45% for 64-byte messages in
24*1dcdf01fSchristos# ECB mode, 20% in CBC. Difference in performance for aligned vs.
25*1dcdf01fSchristos# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
26*1dcdf01fSchristos# These are approximately same factors as for hardware support, so
27*1dcdf01fSchristos# there is little reason to rely on the latter. On the contrary, it
28*1dcdf01fSchristos# might actually hurt performance in mixture of aligned and misaligned
29*1dcdf01fSchristos# buffers, because a) if you choose to flip 'align' flag in control
30*1dcdf01fSchristos# word on per-buffer basis, then you'd have to reload key context,
31*1dcdf01fSchristos# which incurs penalty; b) if you choose to set 'align' flag
32*1dcdf01fSchristos# permanently, it limits performance even for aligned data to ~1/2.
33*1dcdf01fSchristos# All above mentioned results were collected on 1.5GHz C7. Nano on the
34*1dcdf01fSchristos# other hand handles unaligned data more gracefully. Depending on
35*1dcdf01fSchristos# algorithm and how unaligned data is, hardware can be up to 70% more
36*1dcdf01fSchristos# efficient than below software alignment procedures, nor does 'align'
37*1dcdf01fSchristos# flag have effect on aligned performance [if has any meaning at all].
38*1dcdf01fSchristos# Therefore suggestion is to unconditionally set 'align' flag on Nano
39*1dcdf01fSchristos# for optimal performance.
40*1dcdf01fSchristos
# Script setup: locate the perlasm helpers relative to this script, redirect
# STDOUT to the output file and initialise the x86 assembler back-end.
41*1dcdf01fSchristos$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42*1dcdf01fSchristospush(@INC,"${dir}","${dir}../../crypto/perlasm");
43*1dcdf01fSchristosrequire "x86asm.pl";
44*1dcdf01fSchristos
# The last command-line argument names the output file.  Use three-argument
# open so the file name can never be interpreted as a mode, and fail loudly
# instead of silently emitting the assembly to the inherited STDOUT.
45*1dcdf01fSchristos$output=pop;
46*1dcdf01fSchristosopen STDOUT,">",$output or die "can't open $output: $!";
47*1dcdf01fSchristos
# The first remaining argument is handed to the perlasm back-end.
48*1dcdf01fSchristos&asm_init($ARGV[0]);
49*1dcdf01fSchristos
50*1dcdf01fSchristos%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
51*1dcdf01fSchristos$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16
52*1dcdf01fSchristos
# Register assignment shared by the routines below.
53*1dcdf01fSchristos$ctx="edx";
54*1dcdf01fSchristos$out="edi";
55*1dcdf01fSchristos$inp="esi";
56*1dcdf01fSchristos$len="ecx";
57*1dcdf01fSchristos$chunk="ebx";
58*1dcdf01fSchristos
# padlock_capability: takes no arguments.
# Returns 0 in eax when CPUID is unavailable, the vendor string is neither
# "CentaurHauls" nor "  Shanghai  ", or extended leaf 0xC0000001 is not
# reported.  Otherwise returns edx of CPUID leaf 0xC0000001 with bit#4
# replaced by a "Nano" indicator derived from CPUID leaf 1.
59*1dcdf01fSchristos&function_begin_B("padlock_capability");
60*1dcdf01fSchristos	&push	("ebx");
# Try to toggle bit 21 of EFLAGS; if it does not stick, bail out before
# executing cpuid.
61*1dcdf01fSchristos	&pushf	();
62*1dcdf01fSchristos	&pop	("eax");
63*1dcdf01fSchristos	&mov	("ecx","eax");
64*1dcdf01fSchristos	&xor	("eax",1<<21);
65*1dcdf01fSchristos	&push	("eax");
66*1dcdf01fSchristos	&popf	();
67*1dcdf01fSchristos	&pushf	();
68*1dcdf01fSchristos	&pop	("eax");
69*1dcdf01fSchristos	&xor	("ecx","eax");
70*1dcdf01fSchristos	&xor	("eax","eax");
71*1dcdf01fSchristos	&bt	("ecx",21);
72*1dcdf01fSchristos	&jnc	(&label("noluck"));
# cpuid leaf 0: vendor string in ebx:edx:ecx.  The literals are written
# byte-reversed because unpack("H*") yields the big-endian hex of the
# little-endian register contents ("Cent","aurH","auls").
73*1dcdf01fSchristos	&cpuid	();
74*1dcdf01fSchristos	&xor	("eax","eax");
75*1dcdf01fSchristos	&cmp	("ebx","0x".unpack("H*",'tneC'));
76*1dcdf01fSchristos	&jne	(&label("zhaoxin"));
77*1dcdf01fSchristos	&cmp	("edx","0x".unpack("H*",'Hrua'));
78*1dcdf01fSchristos	&jne	(&label("noluck"));
79*1dcdf01fSchristos	&cmp	("ecx","0x".unpack("H*",'slua'));
80*1dcdf01fSchristos	&jne	(&label("noluck"));
81*1dcdf01fSchristos	&jmp	(&label("zhaoxinEnd"));
# Alternative vendor string "  Shanghai  ".
82*1dcdf01fSchristos&set_label("zhaoxin");
83*1dcdf01fSchristos	&cmp	("ebx","0x".unpack("H*",'hS  '));
84*1dcdf01fSchristos	&jne	(&label("noluck"));
85*1dcdf01fSchristos	&cmp	("edx","0x".unpack("H*",'hgna'));
86*1dcdf01fSchristos	&jne	(&label("noluck"));
87*1dcdf01fSchristos	&cmp	("ecx","0x".unpack("H*",'  ia'));
88*1dcdf01fSchristos	&jne	(&label("noluck"));
89*1dcdf01fSchristos&set_label("zhaoxinEnd");
# Leaf 0xC0000000 returns the highest extended leaf in eax; require at
# least 0xC0000001.
90*1dcdf01fSchristos	&mov	("eax",0xC0000000);
91*1dcdf01fSchristos	&cpuid	();
92*1dcdf01fSchristos	&mov	("edx","eax");
93*1dcdf01fSchristos	&xor	("eax","eax");
94*1dcdf01fSchristos	&cmp	("edx",0xC0000001);
95*1dcdf01fSchristos	&jb	(&label("noluck"));
# Leaf 1: ignore the low nibble and compare the next two nibbles with 0x6f.
96*1dcdf01fSchristos	&mov	("eax",1);
97*1dcdf01fSchristos	&cpuid	();
98*1dcdf01fSchristos	&or	("eax",0x0f);
99*1dcdf01fSchristos	&xor	("ebx","ebx");
100*1dcdf01fSchristos	&and	("eax",0x0fff);
101*1dcdf01fSchristos	&cmp	("eax",0x06ff);		# check for Nano
102*1dcdf01fSchristos	&sete	("bl");
# Leaf 0xC0000001: feature flags in edx.  ebx is preserved around cpuid
# because cpuid overwrites it.
103*1dcdf01fSchristos	&mov	("eax",0xC0000001);
104*1dcdf01fSchristos	&push	("ebx");
105*1dcdf01fSchristos	&cpuid	();
106*1dcdf01fSchristos	&pop	("ebx");
107*1dcdf01fSchristos	&mov	("eax","edx");
108*1dcdf01fSchristos	&shl	("ebx",4);		# bit#4 denotes Nano
109*1dcdf01fSchristos	&and	("eax",0xffffffef);
# The statement terminator used to be missing here, which made this call
# and the following set_label() operands of a single bitwise-AND expression.
110*1dcdf01fSchristos	&or	("eax","ebx");
111*1dcdf01fSchristos&set_label("noluck");
112*1dcdf01fSchristos	&pop	("ebx");
113*1dcdf01fSchristos	&ret	();
114*1dcdf01fSchristos&function_end_B("padlock_capability");
115*1dcdf01fSchristos
# padlock_key_bswap: byte-swaps 32-bit words in place.
# Argument 0 is a pointer; the 32-bit value n at offset 240 from it
# determines the count, (n+1)*4 words, starting at the pointer itself.
# NOTE(review): offset 240 presumably holds the round count of an AES key
# schedule - confirm against the caller.
116*1dcdf01fSchristos&function_begin_B("padlock_key_bswap");
117*1dcdf01fSchristos	&mov	("edx",&wparam(0));
118*1dcdf01fSchristos	&mov	("ecx",&DWP(240,"edx"));
119*1dcdf01fSchristos	&inc	("ecx");
120*1dcdf01fSchristos	&shl	("ecx",2);			# (n+1)*4 words to swap
121*1dcdf01fSchristos&set_label("bswap_loop");
122*1dcdf01fSchristos	&mov	("eax",&DWP(0,"edx"));
123*1dcdf01fSchristos	&bswap	("eax");
124*1dcdf01fSchristos	&mov	(&DWP(0,"edx"),"eax");
125*1dcdf01fSchristos	&lea	("edx",&DWP(4,"edx"));		# advance without touching flags
126*1dcdf01fSchristos	&sub	("ecx",1);
127*1dcdf01fSchristos	&jnz	(&label("bswap_loop"));
128*1dcdf01fSchristos	&ret	();
129*1dcdf01fSchristos&function_end_B("padlock_key_bswap");
130*1dcdf01fSchristos
131*1dcdf01fSchristos# This is heuristic key context tracing. At first one
132*1dcdf01fSchristos# believes that one should use atomic swap instructions,
133*1dcdf01fSchristos# but it's not actually necessary. Point is that if
134*1dcdf01fSchristos# padlock_saved_context was changed by another thread
135*1dcdf01fSchristos# after we've read it and before we compare it with ctx,
136*1dcdf01fSchristos# our key *shall* be reloaded upon thread context switch
137*1dcdf01fSchristos# and we are therefore set in either case...
138*1dcdf01fSchristos&static_label("padlock_saved_context");
139*1dcdf01fSchristos
# padlock_verify_context: public wrapper around _padlock_verify_ctx.
# Loads argument 0 into $ctx, places the address of padlock_saved_context
# in eax (absolute on win32/coff, otherwise as an offset relative to
# verify_pic_point that the callee completes with its return address),
# pushes EFLAGS for the callee to inspect and discards it afterwards.
140*1dcdf01fSchristos&function_begin_B("padlock_verify_context");
141*1dcdf01fSchristos	&mov	($ctx,&wparam(0));
142*1dcdf01fSchristos	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
143*1dcdf01fSchristos		       &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
144*1dcdf01fSchristos	&pushf	();
145*1dcdf01fSchristos	&call	("_padlock_verify_ctx");
146*1dcdf01fSchristos&set_label("verify_pic_point");
147*1dcdf01fSchristos	&lea	("esp",&DWP(4,"esp"));		# drop the pushed EFLAGS
148*1dcdf01fSchristos	&ret	();
149*1dcdf01fSchristos&function_end_B("padlock_verify_context");
150*1dcdf01fSchristos
# _padlock_verify_ctx: internal helper with a register-based interface.
# On entry: $ctx holds the context pointer, eax the (possibly PIC-relative)
# address of padlock_saved_context, [esp] the return address and [esp+4]
# the caller's pushed EFLAGS.
# If bit 30 of the saved EFLAGS is set and $ctx differs from the remembered
# context, EFLAGS is rewritten via pushf/popf.  In every case $ctx is
# stored as the new padlock_saved_context.
151*1dcdf01fSchristos&function_begin_B("_padlock_verify_ctx");
152*1dcdf01fSchristos	&add	("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
153*1dcdf01fSchristos	&bt	(&DWP(4,"esp"),30);		# eflags
154*1dcdf01fSchristos	&jnc	(&label("verified"));
155*1dcdf01fSchristos	&cmp	($ctx,&DWP(0,"eax"));
156*1dcdf01fSchristos	&je	(&label("verified"));
# NOTE(review): the pushf/popf pair is the same sequence padlock_reload_key
# uses, presumably to make the unit reload the key - confirm against the
# PadLock documentation.
157*1dcdf01fSchristos	&pushf	();
158*1dcdf01fSchristos	&popf	();
159*1dcdf01fSchristos&set_label("verified");
160*1dcdf01fSchristos	&mov	(&DWP(0,"eax"),$ctx);
161*1dcdf01fSchristos	&ret	();
162*1dcdf01fSchristos&function_end_B("_padlock_verify_ctx");
163*1dcdf01fSchristos
# padlock_reload_key: takes no arguments; rewrites EFLAGS with its own
# value via pushf/popf and returns.
# NOTE(review): going by the name, this write is what makes the hardware
# reload the key on the next xcrypt - confirm against PadLock documentation.
164*1dcdf01fSchristos&function_begin_B("padlock_reload_key");
165*1dcdf01fSchristos	&pushf	();
166*1dcdf01fSchristos	&popf	();
167*1dcdf01fSchristos	&ret	();
168*1dcdf01fSchristos&function_end_B("padlock_reload_key");
169*1dcdf01fSchristos
# padlock_aes_block: processes exactly one block with "rep xcryptecb".
# Arguments: 0 = output pointer, 1 = input pointer, 2 = context pointer.
# The control word is taken from context+16 and the key from context+32.
# edi, esi and ebx are saved and restored around the instruction.
170*1dcdf01fSchristos&function_begin_B("padlock_aes_block");
171*1dcdf01fSchristos	&push	("edi");
172*1dcdf01fSchristos	&push	("esi");
173*1dcdf01fSchristos	&push	("ebx");
# Three registers were pushed above.
# NOTE(review): assumes &wparam() addresses the arguments correctly past
# those pushes - confirm in x86asm.pl.
174*1dcdf01fSchristos	&mov	($out,&wparam(0));		# must be 16-byte aligned
175*1dcdf01fSchristos	&mov	($inp,&wparam(1));		# must be 16-byte aligned
176*1dcdf01fSchristos	&mov	($ctx,&wparam(2));
177*1dcdf01fSchristos	&mov	($len,1);			# block count
178*1dcdf01fSchristos	&lea	("ebx",&DWP(32,$ctx));		# key
179*1dcdf01fSchristos	&lea	($ctx,&DWP(16,$ctx));		# control word
180*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
181*1dcdf01fSchristos	&pop	("ebx");
182*1dcdf01fSchristos	&pop	("esi");
183*1dcdf01fSchristos	&pop	("edi");
184*1dcdf01fSchristos	&ret	();
185*1dcdf01fSchristos&function_end_B("padlock_aes_block");
186*1dcdf01fSchristos
# generate_mode($mode,$opcode): emits the function padlock_${mode}_encrypt.
#   $mode   - mode name; used in the function name and every label, and
#             selects mode-specific code paths ("ctr32", "ecb", and the
#             entries of %PADLOCK_PREFETCH are special-cased).
#   $opcode - final byte of the "rep xcrypt*" instruction encoding.
# The emitted function jumps to the ${mode}_abort label, skipping the
# "mov eax,1" success path, when the context pointer or the length is not
# a multiple of 16; otherwise it sets eax to 1 before the epilogue.
# Data is processed in chunks of at most $PADLOCK_CHUNK bytes.  Misaligned
# input and/or output is bounced through a 16-byte aligned scratch area
# carved out of the stack, which is zeroed before returning.
187*1dcdf01fSchristossub generate_mode {
188*1dcdf01fSchristosmy ($mode,$opcode) = @_;
189*1dcdf01fSchristos# int padlock_$mode_encrypt(void *out, const void *inp,
190*1dcdf01fSchristos#		struct padlock_cipher_data *ctx, size_t len);
191*1dcdf01fSchristos&function_begin("padlock_${mode}_encrypt");
192*1dcdf01fSchristos	&mov	($out,&wparam(0));
193*1dcdf01fSchristos	&mov	($inp,&wparam(1));
194*1dcdf01fSchristos	&mov	($ctx,&wparam(2));
195*1dcdf01fSchristos	&mov	($len,&wparam(3));
# Reject a context pointer or length that is not a multiple of 16.
196*1dcdf01fSchristos	&test	($ctx,15);
197*1dcdf01fSchristos	&jnz	(&label("${mode}_abort"));
198*1dcdf01fSchristos	&test	($len,15);
199*1dcdf01fSchristos	&jnz	(&label("${mode}_abort"));
# Same calling sequence as padlock_verify_context: eax = address (or
# PIC-relative offset) of padlock_saved_context, EFLAGS pushed for the
# callee.  The pushed EFLAGS stays on the stack until the exit path.
200*1dcdf01fSchristos	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
201*1dcdf01fSchristos		       &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
202*1dcdf01fSchristos	&pushf	();
203*1dcdf01fSchristos	&cld	();
204*1dcdf01fSchristos	&call	("_padlock_verify_ctx");
205*1dcdf01fSchristos&set_label("${mode}_pic_point");
206*1dcdf01fSchristos	&lea	($ctx,&DWP(16,$ctx));	# control word
207*1dcdf01fSchristos	&xor	("eax","eax");
208*1dcdf01fSchristos					if ($mode eq "ctr32") {
209*1dcdf01fSchristos	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
210*1dcdf01fSchristos					} else {
211*1dcdf01fSchristos	&xor	("ebx","ebx");
212*1dcdf01fSchristos	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
213*1dcdf01fSchristos	&jnz	(&label("${mode}_aligned"));
214*1dcdf01fSchristos	&test	($out,0x0f);
215*1dcdf01fSchristos	&setz	("al");			# !out_misaligned
216*1dcdf01fSchristos	&test	($inp,0x0f);
217*1dcdf01fSchristos	&setz	("bl");			# !inp_misaligned
218*1dcdf01fSchristos	&test	("eax","ebx");
219*1dcdf01fSchristos	&jnz	(&label("${mode}_aligned"));
220*1dcdf01fSchristos	&neg	("eax");
221*1dcdf01fSchristos					}
# Set up the stack frame: ebp points at a 16-byte aligned save area holding
# out/inp/len/chunk at offsets 0/4/8/12 and the unaligned frame pointer at
# offset 16; esp is lowered by the scratch buffer size when one is needed.
222*1dcdf01fSchristos	&mov	($chunk,$PADLOCK_CHUNK);
223*1dcdf01fSchristos	&not	("eax");		# out_misaligned?-1:0
224*1dcdf01fSchristos	&lea	("ebp",&DWP(-24,"esp"));
225*1dcdf01fSchristos	&cmp	($len,$chunk);
226*1dcdf01fSchristos	&cmovc	($chunk,$len);		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
227*1dcdf01fSchristos	&and	("eax",$chunk);		# out_misaligned?chunk:0
228*1dcdf01fSchristos	&mov	($chunk,$len);
229*1dcdf01fSchristos	&neg	("eax");
230*1dcdf01fSchristos	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
231*1dcdf01fSchristos	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
232*1dcdf01fSchristos	&mov	("eax",$PADLOCK_CHUNK);
233*1dcdf01fSchristos	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
234*1dcdf01fSchristos	&mov	("eax","ebp");
235*1dcdf01fSchristos	&and	("ebp",-16);
236*1dcdf01fSchristos	&and	("esp",-16);
237*1dcdf01fSchristos	&mov	(&DWP(16,"ebp"),"eax");
238*1dcdf01fSchristos    if ($PADLOCK_PREFETCH{$mode}) {
239*1dcdf01fSchristos	&cmp	($len,$chunk);
240*1dcdf01fSchristos	&ja	(&label("${mode}_loop"));
241*1dcdf01fSchristos	&mov	("eax",$inp);		# check if prefetch crosses page
242*1dcdf01fSchristos	&cmp	("ebp","esp");
243*1dcdf01fSchristos	&cmove	("eax",$out);
244*1dcdf01fSchristos	&add	("eax",$len);
245*1dcdf01fSchristos	&neg	("eax");
246*1dcdf01fSchristos	&and	("eax",0xfff);		# distance to page boundary
247*1dcdf01fSchristos	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
248*1dcdf01fSchristos	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
249*1dcdf01fSchristos	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
250*1dcdf01fSchristos	&and	($chunk,"eax");
251*1dcdf01fSchristos	&jz	(&label("${mode}_unaligned_tail"));
252*1dcdf01fSchristos    }
253*1dcdf01fSchristos	&jmp	(&label("${mode}_loop"));
254*1dcdf01fSchristos
# Main loop: one iteration handles $chunk bytes.
255*1dcdf01fSchristos&set_label("${mode}_loop",16);
256*1dcdf01fSchristos	&mov	(&DWP(0,"ebp"),$out);		# save parameters
257*1dcdf01fSchristos	&mov	(&DWP(4,"ebp"),$inp);
258*1dcdf01fSchristos	&mov	(&DWP(8,"ebp"),$len);
259*1dcdf01fSchristos	&mov	($len,$chunk);
260*1dcdf01fSchristos	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
261*1dcdf01fSchristos						if ($mode eq "ctr32") {
# ctr32: fill the scratch buffer with consecutive counter blocks.  The
# last 32-bit word is byte-swapped, incremented and swapped back for each
# block; the updated value is written back to the context afterwards.
262*1dcdf01fSchristos	&mov	("ecx",&DWP(-4,$ctx));
263*1dcdf01fSchristos	&xor	($out,$out);
264*1dcdf01fSchristos	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
265*1dcdf01fSchristos&set_label("${mode}_prepare");
266*1dcdf01fSchristos	&mov	(&DWP(12,"esp",$out),"ecx");
267*1dcdf01fSchristos	&bswap	("ecx");
268*1dcdf01fSchristos	&movq	(&QWP(0,"esp",$out),"mm0");
269*1dcdf01fSchristos	&inc	("ecx");
270*1dcdf01fSchristos	&mov	(&DWP(8,"esp",$out),"eax");
271*1dcdf01fSchristos	&bswap	("ecx");
272*1dcdf01fSchristos	&lea	($out,&DWP(16,$out));
273*1dcdf01fSchristos	&cmp	($out,$chunk);
274*1dcdf01fSchristos	&jb	(&label("${mode}_prepare"));
275*1dcdf01fSchristos
276*1dcdf01fSchristos	&mov	(&DWP(-4,$ctx),"ecx");
277*1dcdf01fSchristos	&lea	($inp,&DWP(0,"esp"));
278*1dcdf01fSchristos	&lea	($out,&DWP(0,"esp"));
279*1dcdf01fSchristos	&mov	($len,$chunk);
280*1dcdf01fSchristos						} else {
# Other modes: redirect a misaligned output to the scratch buffer and copy
# a misaligned input into it before running the cipher.
281*1dcdf01fSchristos	&test	($out,0x0f);			# out_misaligned
282*1dcdf01fSchristos	&cmovnz	($out,"esp");
283*1dcdf01fSchristos	&test	($inp,0x0f);			# inp_misaligned
284*1dcdf01fSchristos	&jz	(&label("${mode}_inp_aligned"));
285*1dcdf01fSchristos	&shr	($len,2);
286*1dcdf01fSchristos	&data_byte(0xf3,0xa5);			# rep movsl
287*1dcdf01fSchristos	&sub	($out,$chunk);
288*1dcdf01fSchristos	&mov	($len,$chunk);
289*1dcdf01fSchristos	&mov	($inp,$out);
290*1dcdf01fSchristos&set_label("${mode}_inp_aligned");
291*1dcdf01fSchristos						}
292*1dcdf01fSchristos	&lea	("eax",&DWP(-16,$ctx));		# ivp
293*1dcdf01fSchristos	&lea	("ebx",&DWP(16,$ctx));		# key
294*1dcdf01fSchristos	&shr	($len,4);			# len/=AES_BLOCK_SIZE
295*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
296*1dcdf01fSchristos						if ($mode !~ /ecb|ctr/) {
297*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"eax"));
298*1dcdf01fSchristos	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
299*1dcdf01fSchristos						}
300*1dcdf01fSchristos	&mov	($out,&DWP(0,"ebp"));		# restore parameters
301*1dcdf01fSchristos	&mov	($chunk,&DWP(12,"ebp"));
302*1dcdf01fSchristos						if ($mode eq "ctr32") {
# ctr32: xor the encrypted counter blocks in the scratch buffer with the
# caller's input and store the result to the caller's output.
303*1dcdf01fSchristos	&mov	($inp,&DWP(4,"ebp"));
304*1dcdf01fSchristos	&xor	($len,$len);
305*1dcdf01fSchristos&set_label("${mode}_xor");
306*1dcdf01fSchristos	&movups	("xmm1",&QWP(0,$inp,$len));
307*1dcdf01fSchristos	&lea	($len,&DWP(16,$len));
308*1dcdf01fSchristos	&pxor	("xmm1",&QWP(-16,"esp",$len));
309*1dcdf01fSchristos	&movups	(&QWP(-16,$out,$len),"xmm1");
310*1dcdf01fSchristos	&cmp	($len,$chunk);
311*1dcdf01fSchristos	&jb	(&label("${mode}_xor"));
312*1dcdf01fSchristos						} else {
# Other modes: copy the result out of the scratch buffer when the caller's
# output pointer is misaligned.
313*1dcdf01fSchristos	&test	($out,0x0f);
314*1dcdf01fSchristos	&jz	(&label("${mode}_out_aligned"));
315*1dcdf01fSchristos	&mov	($len,$chunk);
316*1dcdf01fSchristos	&lea	($inp,&DWP(0,"esp"));
317*1dcdf01fSchristos	&shr	($len,2);
318*1dcdf01fSchristos	&data_byte(0xf3,0xa5);			# rep movsl
319*1dcdf01fSchristos	&sub	($out,$chunk);
320*1dcdf01fSchristos&set_label("${mode}_out_aligned");
321*1dcdf01fSchristos	&mov	($inp,&DWP(4,"ebp"));
322*1dcdf01fSchristos						}
# Advance the pointers, decrease the remaining length and reset the chunk
# size to $PADLOCK_CHUNK for the next iteration.
323*1dcdf01fSchristos	&mov	($len,&DWP(8,"ebp"));
324*1dcdf01fSchristos	&add	($out,$chunk);
325*1dcdf01fSchristos	&add	($inp,$chunk);
326*1dcdf01fSchristos	&sub	($len,$chunk);
327*1dcdf01fSchristos	&mov	($chunk,$PADLOCK_CHUNK);
328*1dcdf01fSchristos    if (!$PADLOCK_PREFETCH{$mode}) {
329*1dcdf01fSchristos	&jnz	(&label("${mode}_loop"));
330*1dcdf01fSchristos    } else {
331*1dcdf01fSchristos	&jz	(&label("${mode}_break"));
332*1dcdf01fSchristos	&cmp	($len,$chunk);
333*1dcdf01fSchristos	&jae	(&label("${mode}_loop"));
334*1dcdf01fSchristos
# Tail for modes with a prefetch erratum: copy the remaining input to the
# stack (allocating space if no scratch buffer exists yet) and run the loop
# once more on that copy.
335*1dcdf01fSchristos&set_label("${mode}_unaligned_tail");
336*1dcdf01fSchristos	&xor	("eax","eax");
337*1dcdf01fSchristos	&cmp	("esp","ebp");
338*1dcdf01fSchristos	&cmove	("eax",$len);
339*1dcdf01fSchristos	&sub	("esp","eax");			# alloca
340*1dcdf01fSchristos	&mov	("eax", $out);			# save parameters
341*1dcdf01fSchristos	&mov	($chunk,$len);
342*1dcdf01fSchristos	&shr	($len,2);
343*1dcdf01fSchristos	&lea	($out,&DWP(0,"esp"));
344*1dcdf01fSchristos	&data_byte(0xf3,0xa5);			# rep movsl
345*1dcdf01fSchristos	&mov	($inp,"esp");
346*1dcdf01fSchristos	&mov	($out,"eax");			# restore parameters
347*1dcdf01fSchristos	&mov	($len,$chunk);
348*1dcdf01fSchristos	&jmp	(&label("${mode}_loop"));
349*1dcdf01fSchristos
350*1dcdf01fSchristos&set_label("${mode}_break",16);
351*1dcdf01fSchristos    }
352*1dcdf01fSchristos						if ($mode ne "ctr32") {
353*1dcdf01fSchristos	&cmp	("esp","ebp");
354*1dcdf01fSchristos	&je	(&label("${mode}_done"));
355*1dcdf01fSchristos						}
# Zero the scratch buffer between esp and ebp before releasing it.
356*1dcdf01fSchristos	&pxor	("xmm0","xmm0");
357*1dcdf01fSchristos	&lea	("eax",&DWP(0,"esp"));
358*1dcdf01fSchristos&set_label("${mode}_bzero");
359*1dcdf01fSchristos	&movaps	(&QWP(0,"eax"),"xmm0");
360*1dcdf01fSchristos	&lea	("eax",&DWP(16,"eax"));
361*1dcdf01fSchristos	&cmp	("ebp","eax");
362*1dcdf01fSchristos	&ja	(&label("${mode}_bzero"));
363*1dcdf01fSchristos
# Restore the unaligned frame pointer saved at 16(ebp) and undo the -24
# adjustment made when the frame was created.
364*1dcdf01fSchristos&set_label("${mode}_done");
365*1dcdf01fSchristos	&mov	("ebp",&DWP(16,"ebp"));
366*1dcdf01fSchristos	&lea	("esp",&DWP(24,"ebp"));
367*1dcdf01fSchristos						if ($mode ne "ctr32") {
368*1dcdf01fSchristos	&jmp	(&label("${mode}_exit"));
369*1dcdf01fSchristos
# Aligned path: reached when the control word has its align bit set or
# both pointers are 16-byte aligned; the cipher runs directly on the
# caller's buffers.
370*1dcdf01fSchristos&set_label("${mode}_aligned",16);
371*1dcdf01fSchristos    if ($PADLOCK_PREFETCH{$mode}) {
372*1dcdf01fSchristos	&lea	("ebp",&DWP(0,$inp,$len));
373*1dcdf01fSchristos	&neg	("ebp");
374*1dcdf01fSchristos	&and	("ebp",0xfff);			# distance to page boundary
375*1dcdf01fSchristos	&xor	("eax","eax");
376*1dcdf01fSchristos	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
377*1dcdf01fSchristos	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
378*1dcdf01fSchristos	&cmovae	("ebp","eax");
379*1dcdf01fSchristos	&and	("ebp",$len);			# remainder
380*1dcdf01fSchristos	&sub	($len,"ebp");
381*1dcdf01fSchristos	&jz	(&label("${mode}_aligned_tail"));
382*1dcdf01fSchristos    }
383*1dcdf01fSchristos	&lea	("eax",&DWP(-16,$ctx));		# ivp
384*1dcdf01fSchristos	&lea	("ebx",&DWP(16,$ctx));		# key
385*1dcdf01fSchristos	&shr	($len,4);			# len/=AES_BLOCK_SIZE
386*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
387*1dcdf01fSchristos						if ($mode ne "ecb") {
388*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"eax"));
389*1dcdf01fSchristos	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
390*1dcdf01fSchristos						}
391*1dcdf01fSchristos    if ($PADLOCK_PREFETCH{$mode}) {
392*1dcdf01fSchristos	&test	("ebp","ebp");
393*1dcdf01fSchristos	&jz	(&label("${mode}_exit"));
394*1dcdf01fSchristos
# Aligned tail: build the same stack frame as the unaligned path, copy the
# remaining ebp bytes of input to the stack and finish through the loop.
395*1dcdf01fSchristos&set_label("${mode}_aligned_tail");
396*1dcdf01fSchristos	&mov	($len,"ebp");
397*1dcdf01fSchristos	&lea	("ebp",&DWP(-24,"esp"));
398*1dcdf01fSchristos	&mov	("esp","ebp");
399*1dcdf01fSchristos	&mov	("eax","ebp");
400*1dcdf01fSchristos	&sub	("esp",$len);
401*1dcdf01fSchristos	&and	("ebp",-16);
402*1dcdf01fSchristos	&and	("esp",-16);
403*1dcdf01fSchristos	&mov	(&DWP(16,"ebp"),"eax");
404*1dcdf01fSchristos	&mov	("eax", $out);			# save parameters
405*1dcdf01fSchristos	&mov	($chunk,$len);
406*1dcdf01fSchristos	&shr	($len,2);
407*1dcdf01fSchristos	&lea	($out,&DWP(0,"esp"));
408*1dcdf01fSchristos	&data_byte(0xf3,0xa5);			# rep movsl
409*1dcdf01fSchristos	&mov	($inp,"esp");
410*1dcdf01fSchristos	&mov	($out,"eax");			# restore parameters
411*1dcdf01fSchristos	&mov	($len,$chunk);
412*1dcdf01fSchristos	&jmp	(&label("${mode}_loop"));
413*1dcdf01fSchristos    }
414*1dcdf01fSchristos&set_label("${mode}_exit");			}
# Success: return 1 and discard the EFLAGS pushed before the call to
# _padlock_verify_ctx.
415*1dcdf01fSchristos	&mov	("eax",1);
416*1dcdf01fSchristos	&lea	("esp",&DWP(4,"esp"));		# popf
417*1dcdf01fSchristos	&emms	()				if ($mode eq "ctr32");
418*1dcdf01fSchristos&set_label("${mode}_abort");
419*1dcdf01fSchristos&function_end("padlock_${mode}_encrypt");
420*1dcdf01fSchristos}
421*1dcdf01fSchristos
# Instantiate one routine per supported mode; the second argument is the
# last byte of the corresponding "rep xcrypt*" encoding.
422*1dcdf01fSchristos&generate_mode("ecb",0xc8);
423*1dcdf01fSchristos&generate_mode("cbc",0xd0);
424*1dcdf01fSchristos&generate_mode("cfb",0xe0);
425*1dcdf01fSchristos&generate_mode("ofb",0xe8);
426*1dcdf01fSchristos&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
427*1dcdf01fSchristos				# because hardware CTR was introduced later
428*1dcdf01fSchristos				# and even has errata on certain C7 stepping.
429*1dcdf01fSchristos				# own implementation *always* works, though
430*1dcdf01fSchristos				# ~15% slower than dedicated hardware...
431*1dcdf01fSchristos
# padlock_xstore: executes the xstore instruction with edi = argument 0
# and edx = argument 1.  edi is saved and restored; eax is not set by this
# code, so the caller sees whatever the instruction leaves there.
# NOTE(review): argument 0 is presumably the destination buffer and
# argument 1 the xstore control value - confirm against the caller.
432*1dcdf01fSchristos&function_begin_B("padlock_xstore");
433*1dcdf01fSchristos	&push	("edi");
434*1dcdf01fSchristos	&mov	("edi",&wparam(0));
435*1dcdf01fSchristos	&mov	("edx",&wparam(1));
436*1dcdf01fSchristos	&data_byte(0x0f,0xa7,0xc0);		# xstore
437*1dcdf01fSchristos	&pop	("edi");
438*1dcdf01fSchristos	&ret	();
439*1dcdf01fSchristos&function_end_B("padlock_xstore");
440*1dcdf01fSchristos
# _win32_segv_handler: exception handler installed by the *_oneshot
# routines on win32/coff targets.
# Arguments: 0 = pointer to the exception record, 2 = pointer to the
# context record.
# Returns 1 (ExceptionContinueSearch) unless the exception code is
# 0xC0000005, in which case the 32-bit value at offset 184 of the context
# record is advanced by 4 (stepping over the 4-byte "rep xsha*"
# instruction) and 0 (ExceptionContinueExecution) is returned.
441*1dcdf01fSchristos&function_begin_B("_win32_segv_handler");
442*1dcdf01fSchristos	&mov	("eax",1);			# ExceptionContinueSearch
443*1dcdf01fSchristos	&mov	("edx",&wparam(0));		# *ExceptionRecord
444*1dcdf01fSchristos	&mov	("ecx",&wparam(2));		# *ContextRecord
# The statement terminator used to be missing after this call, which fused
# it with the following &jne into one bitwise-AND expression.
445*1dcdf01fSchristos	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
446*1dcdf01fSchristos	&jne	(&label("ret"));
447*1dcdf01fSchristos	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
448*1dcdf01fSchristos	&mov	("eax",0);			# ExceptionContinueExecution
449*1dcdf01fSchristos&set_label("ret");
450*1dcdf01fSchristos	&ret	();
451*1dcdf01fSchristos&function_end_B("_win32_segv_handler");
452*1dcdf01fSchristos&safeseh("_win32_segv_handler")			if ($::win32);
453*1dcdf01fSchristos
# padlock_sha1_oneshot: runs "rep xsha1" with eax = 0.
# Arguments: 0 = pointer to the 20-byte state, 1 = input pointer (esi),
# 2 = value loaded into ecx.
# NOTE(review): ecx is presumably the input length - confirm units against
# the caller.
# The state is copied to a 16-byte aligned scratch area on the stack, the
# instruction operates on that copy, and the result is copied back.
# On win32/coff an exception registration record pointing at
# _win32_segv_handler is linked in around the instruction.
454*1dcdf01fSchristos&function_begin_B("padlock_sha1_oneshot");
455*1dcdf01fSchristos	&push	("edi");
456*1dcdf01fSchristos	&push	("esi");
457*1dcdf01fSchristos	&xor	("eax","eax");
458*1dcdf01fSchristos	&mov	("edi",&wparam(0));
459*1dcdf01fSchristos	&mov	("esi",&wparam(1));
460*1dcdf01fSchristos	&mov	("ecx",&wparam(2));
461*1dcdf01fSchristos    if ($::win32 or $::coff) {
# eax is zero here, so the two raw instructions below address %fs:0.
462*1dcdf01fSchristos    	&push	(&::islabel("_win32_segv_handler"));
463*1dcdf01fSchristos	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
464*1dcdf01fSchristos	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
465*1dcdf01fSchristos    }
466*1dcdf01fSchristos	&mov	("edx","esp");			# put aside %esp
467*1dcdf01fSchristos	&add	("esp",-128);			# 32 is enough but spec says 128
468*1dcdf01fSchristos	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
469*1dcdf01fSchristos	&and	("esp",-16);
470*1dcdf01fSchristos	&mov	("eax",&DWP(16,"edi"));
471*1dcdf01fSchristos	&movaps	(&QWP(0,"esp"),"xmm0");
472*1dcdf01fSchristos	&mov	("edi","esp");
473*1dcdf01fSchristos	&mov	(&DWP(16,"esp"),"eax");
474*1dcdf01fSchristos	&xor	("eax","eax");
475*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
476*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"esp"));
477*1dcdf01fSchristos	&mov	("eax",&DWP(16,"esp"));
478*1dcdf01fSchristos	&mov	("esp","edx");			# restore %esp
479*1dcdf01fSchristos    if ($::win32 or $::coff) {
# Unlink the registration record and drop the pushed handler address.
480*1dcdf01fSchristos	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
481*1dcdf01fSchristos	&lea	("esp",&DWP(4,"esp"));
482*1dcdf01fSchristos    }
483*1dcdf01fSchristos	&mov	("edi",&wparam(0));
484*1dcdf01fSchristos	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
485*1dcdf01fSchristos	&mov	(&DWP(16,"edi"),"eax");
486*1dcdf01fSchristos	&pop	("esi");
487*1dcdf01fSchristos	&pop	("edi");
488*1dcdf01fSchristos	&ret	();
489*1dcdf01fSchristos&function_end_B("padlock_sha1_oneshot");
490*1dcdf01fSchristos
# padlock_sha1_blocks: runs "rep xsha1" with eax = -1.
# Arguments: 0 = pointer to the 20-byte state, 1 = input pointer (esi),
# 2 = value loaded into ecx.
# NOTE(review): ecx is presumably a block count - confirm against the
# caller.
# The state is bounced through a 16-byte aligned scratch area on the
# stack.  Unlike the oneshot variant, no exception handler is installed.
491*1dcdf01fSchristos&function_begin_B("padlock_sha1_blocks");
492*1dcdf01fSchristos	&push	("edi");
493*1dcdf01fSchristos	&push	("esi");
494*1dcdf01fSchristos	&mov	("edi",&wparam(0));
495*1dcdf01fSchristos	&mov	("esi",&wparam(1));
496*1dcdf01fSchristos	&mov	("edx","esp");			# put aside %esp
497*1dcdf01fSchristos	&mov	("ecx",&wparam(2));
498*1dcdf01fSchristos	&add	("esp",-128);
499*1dcdf01fSchristos	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
500*1dcdf01fSchristos	&and	("esp",-16);
501*1dcdf01fSchristos	&mov	("eax",&DWP(16,"edi"));
502*1dcdf01fSchristos	&movaps	(&QWP(0,"esp"),"xmm0");
503*1dcdf01fSchristos	&mov	("edi","esp");
504*1dcdf01fSchristos	&mov	(&DWP(16,"esp"),"eax");
505*1dcdf01fSchristos	&mov	("eax",-1);
506*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
507*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"esp"));
508*1dcdf01fSchristos	&mov	("eax",&DWP(16,"esp"));
509*1dcdf01fSchristos	&mov	("esp","edx");			# restore %esp
510*1dcdf01fSchristos	&mov	("edi",&wparam(0));
511*1dcdf01fSchristos	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
512*1dcdf01fSchristos	&mov	(&DWP(16,"edi"),"eax");
513*1dcdf01fSchristos 	&pop	("esi");
514*1dcdf01fSchristos	&pop	("edi");
515*1dcdf01fSchristos	&ret	();
516*1dcdf01fSchristos&function_end_B("padlock_sha1_blocks");
517*1dcdf01fSchristos
# padlock_sha256_oneshot: runs "rep xsha256" with eax = 0.
# Arguments: 0 = pointer to the 32-byte state, 1 = input pointer (esi),
# 2 = value loaded into ecx.
# NOTE(review): ecx is presumably the input length - confirm units against
# the caller.
# The state is bounced through a 16-byte aligned scratch area on the
# stack.  On win32/coff an exception registration record pointing at
# _win32_segv_handler is linked in around the instruction.
518*1dcdf01fSchristos&function_begin_B("padlock_sha256_oneshot");
519*1dcdf01fSchristos	&push	("edi");
520*1dcdf01fSchristos	&push	("esi");
521*1dcdf01fSchristos	&xor	("eax","eax");
522*1dcdf01fSchristos	&mov	("edi",&wparam(0));
523*1dcdf01fSchristos	&mov	("esi",&wparam(1));
524*1dcdf01fSchristos	&mov	("ecx",&wparam(2));
525*1dcdf01fSchristos    if ($::win32 or $::coff) {
# eax is zero here, so the two raw instructions below address %fs:0.
526*1dcdf01fSchristos    	&push	(&::islabel("_win32_segv_handler"));
527*1dcdf01fSchristos	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
528*1dcdf01fSchristos	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
529*1dcdf01fSchristos    }
530*1dcdf01fSchristos	&mov	("edx","esp");			# put aside %esp
531*1dcdf01fSchristos	&add	("esp",-128);
532*1dcdf01fSchristos	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
533*1dcdf01fSchristos	&and	("esp",-16);
534*1dcdf01fSchristos	&movups	("xmm1",&QWP(16,"edi"));
535*1dcdf01fSchristos	&movaps	(&QWP(0,"esp"),"xmm0");
536*1dcdf01fSchristos	&mov	("edi","esp");
537*1dcdf01fSchristos	&movaps	(&QWP(16,"esp"),"xmm1");
538*1dcdf01fSchristos	&xor	("eax","eax");
539*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
540*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"esp"));
541*1dcdf01fSchristos	&movaps	("xmm1",&QWP(16,"esp"));
542*1dcdf01fSchristos	&mov	("esp","edx");			# restore %esp
543*1dcdf01fSchristos    if ($::win32 or $::coff) {
# Unlink the registration record and drop the pushed handler address.
544*1dcdf01fSchristos	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
545*1dcdf01fSchristos	&lea	("esp",&DWP(4,"esp"));
546*1dcdf01fSchristos    }
547*1dcdf01fSchristos	&mov	("edi",&wparam(0));
548*1dcdf01fSchristos	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
549*1dcdf01fSchristos	&movups	(&QWP(16,"edi"),"xmm1");
550*1dcdf01fSchristos	&pop	("esi");
551*1dcdf01fSchristos	&pop	("edi");
552*1dcdf01fSchristos	&ret	();
553*1dcdf01fSchristos&function_end_B("padlock_sha256_oneshot");
554*1dcdf01fSchristos
# padlock_sha256_blocks: runs "rep xsha256" with eax = -1.
# Arguments: 0 = pointer to the 32-byte state, 1 = input pointer (esi),
# 2 = value loaded into ecx.
# NOTE(review): ecx is presumably a block count - confirm against the
# caller.
# The state is bounced through a 16-byte aligned scratch area on the
# stack.  No exception handler is installed.
555*1dcdf01fSchristos&function_begin_B("padlock_sha256_blocks");
556*1dcdf01fSchristos	&push	("edi");
557*1dcdf01fSchristos	&push	("esi");
558*1dcdf01fSchristos	&mov	("edi",&wparam(0));
559*1dcdf01fSchristos	&mov	("esi",&wparam(1));
560*1dcdf01fSchristos	&mov	("ecx",&wparam(2));
561*1dcdf01fSchristos	&mov	("edx","esp");			# put aside %esp
562*1dcdf01fSchristos	&add	("esp",-128);
563*1dcdf01fSchristos	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
564*1dcdf01fSchristos	&and	("esp",-16);
565*1dcdf01fSchristos	&movups	("xmm1",&QWP(16,"edi"));
566*1dcdf01fSchristos	&movaps	(&QWP(0,"esp"),"xmm0");
567*1dcdf01fSchristos	&mov	("edi","esp");
568*1dcdf01fSchristos	&movaps	(&QWP(16,"esp"),"xmm1");
569*1dcdf01fSchristos	&mov	("eax",-1);
570*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
571*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"esp"));
572*1dcdf01fSchristos	&movaps	("xmm1",&QWP(16,"esp"));
573*1dcdf01fSchristos	&mov	("esp","edx");			# restore %esp
574*1dcdf01fSchristos	&mov	("edi",&wparam(0));
575*1dcdf01fSchristos	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
576*1dcdf01fSchristos	&movups	(&QWP(16,"edi"),"xmm1");
577*1dcdf01fSchristos	&pop	("esi");
578*1dcdf01fSchristos	&pop	("edi");
579*1dcdf01fSchristos	&ret	();
580*1dcdf01fSchristos&function_end_B("padlock_sha256_blocks");
581*1dcdf01fSchristos
# padlock_sha512_blocks: runs "rep xsha512".
# Arguments: 0 = pointer to the 64-byte state, 1 = input pointer (esi),
# 2 = value loaded into ecx.
# NOTE(review): ecx is presumably a block count - confirm against the
# caller.
# The state is bounced through a 16-byte aligned scratch area on the
# stack.
# NOTE(review): unlike the sha1/sha256 *_blocks routines, eax is not
# loaded with -1 before the instruction; at that point it holds whatever
# the caller left in it.  Confirm that xsha512 does not consume eax.
582*1dcdf01fSchristos&function_begin_B("padlock_sha512_blocks");
583*1dcdf01fSchristos	&push	("edi");
584*1dcdf01fSchristos	&push	("esi");
585*1dcdf01fSchristos	&mov	("edi",&wparam(0));
586*1dcdf01fSchristos	&mov	("esi",&wparam(1));
587*1dcdf01fSchristos	&mov	("ecx",&wparam(2));
588*1dcdf01fSchristos	&mov	("edx","esp");			# put aside %esp
589*1dcdf01fSchristos	&add	("esp",-128);
590*1dcdf01fSchristos	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
591*1dcdf01fSchristos	&and	("esp",-16);
592*1dcdf01fSchristos	&movups	("xmm1",&QWP(16,"edi"));
593*1dcdf01fSchristos	&movups	("xmm2",&QWP(32,"edi"));
594*1dcdf01fSchristos	&movups	("xmm3",&QWP(48,"edi"));
595*1dcdf01fSchristos	&movaps	(&QWP(0,"esp"),"xmm0");
596*1dcdf01fSchristos	&mov	("edi","esp");
597*1dcdf01fSchristos	&movaps	(&QWP(16,"esp"),"xmm1");
598*1dcdf01fSchristos	&movaps	(&QWP(32,"esp"),"xmm2");
599*1dcdf01fSchristos	&movaps	(&QWP(48,"esp"),"xmm3");
600*1dcdf01fSchristos	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
601*1dcdf01fSchristos	&movaps	("xmm0",&QWP(0,"esp"));
602*1dcdf01fSchristos	&movaps	("xmm1",&QWP(16,"esp"));
603*1dcdf01fSchristos	&movaps	("xmm2",&QWP(32,"esp"));
604*1dcdf01fSchristos	&movaps	("xmm3",&QWP(48,"esp"));
605*1dcdf01fSchristos	&mov	("esp","edx");			# restore %esp
606*1dcdf01fSchristos	&mov	("edi",&wparam(0));
607*1dcdf01fSchristos	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
608*1dcdf01fSchristos	&movups	(&QWP(16,"edi"),"xmm1");
609*1dcdf01fSchristos	&movups	(&QWP(32,"edi"),"xmm2");
610*1dcdf01fSchristos	&movups	(&QWP(48,"edi"),"xmm3");
611*1dcdf01fSchristos	&pop	("esi");
612*1dcdf01fSchristos	&pop	("edi");
613*1dcdf01fSchristos	&ret	();
614*1dcdf01fSchristos&function_end_B("padlock_sha512_blocks");
615*1dcdf01fSchristos
# Module trailer: identification string, the data-segment variable used by
# the key context tracing above, and final flush of the generated output.
616*1dcdf01fSchristos&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
617*1dcdf01fSchristos&align	(16);
618*1dcdf01fSchristos
619*1dcdf01fSchristos&dataseg();
620*1dcdf01fSchristos# Essentially this variable belongs in thread local storage.
621*1dcdf01fSchristos# Having this variable global on the other hand can only cause
622*1dcdf01fSchristos# few bogus key reloads [if any at all on single-CPU system],
623*1dcdf01fSchristos# so we accept the penalty...
624*1dcdf01fSchristos&set_label("padlock_saved_context",4);
625*1dcdf01fSchristos&data_word(0);
626*1dcdf01fSchristos
627*1dcdf01fSchristos&asm_finish();
628*1dcdf01fSchristos
# Buffered write errors (e.g. a full disk) only surface at close time;
# abort instead of leaving a silently truncated assembly file behind.
629*1dcdf01fSchristosclose STDOUT or die "error closing STDOUT: $!";
630