#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

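# The .init snippet below arranges for OPENSSL_cpuid_setup to run when the
# object is loaded (on ELF targets), filling OPENSSL_ia32cap_P, the 16-byte
# capability vector the rest of the library consults at run time.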
print<<___;
.extern		OPENSSL_cpuid_setup
.hidden		OPENSSL_cpuid_setup
.section	.init
	call	OPENSSL_cpuid_setup

.hidden	OPENSSL_ia32cap_P
.comm	OPENSSL_ia32cap_P,16,4

.text

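# OPENSSL_atomic_add(ptr, inc): atomically add 'inc' to the 32-bit value at
# 'ptr' with a lock cmpxchg spin loop and return the new value, sign-extended
# to 64 bits.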
.globl	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,\@abi-omnipotent
.align	16
OPENSSL_atomic_add:
.cfi_startproc
	movl	($arg1),%eax
.Lspin:	leaq	($arg2,%rax),%r8
	.byte	0xf0		# lock
	cmpxchgl	%r8d,($arg1)
	jne	.Lspin
	movl	%r8d,%eax
	.byte	0x48,0x98	# cltq/cdqe
	ret
.cfi_endproc
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

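# OPENSSL_rdtsc returns the full 64-bit time-stamp counter; RDTSC leaves the
# low half in EAX and the high half in EDX, which are merged into RAX here.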
.globl	OPENSSL_rdtsc
.type	OPENSSL_rdtsc,\@abi-omnipotent
.align	16
OPENSSL_rdtsc:
.cfi_startproc
	rdtsc
	shl	\$32,%rdx
	or	%rdx,%rax
	ret
.cfi_endproc
.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc

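# OPENSSL_ia32_cpuid(cap_vector): probe CPUID and return the adjusted ECX:EDX
# feature words of leaf 1 packed into RAX (ECX in the upper half), while the
# leaf 7 extended-feature words are stored at offsets 8 and 12 of the buffer
# passed in the 1st argument.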
.globl	OPENSSL_ia32_cpuid
.type	OPENSSL_ia32_cpuid,\@function,1
.align	16
OPENSSL_ia32_cpuid:
.cfi_startproc
	mov	%rbx,%r8		# save %rbx
.cfi_register	%rbx,%r8

	xor	%eax,%eax
	mov	%rax,8(%rdi)		# clear extended feature flags
	cpuid
	mov	%eax,%r11d		# max value for standard query level

	xor	%eax,%eax
	cmp	\$0x756e6547,%ebx	# "Genu"
	setne	%al
	mov	%eax,%r9d
	cmp	\$0x49656e69,%edx	# "ineI"
	setne	%al
	or	%eax,%r9d
	cmp	\$0x6c65746e,%ecx	# "ntel"
	setne	%al
	or	%eax,%r9d		# 0 indicates Intel CPU
	jz	.Lintel

	cmp	\$0x68747541,%ebx	# "Auth"
	setne	%al
	mov	%eax,%r10d
	cmp	\$0x69746E65,%edx	# "enti"
	setne	%al
	or	%eax,%r10d
	cmp	\$0x444D4163,%ecx	# "cAMD"
	setne	%al
	or	%eax,%r10d		# 0 indicates AMD CPU
	jnz	.Lintel

	# AMD specific
	mov	\$0x80000000,%eax
	cpuid
	cmp	\$0x80000001,%eax
	jb	.Lintel
	mov	%eax,%r10d
	mov	\$0x80000001,%eax
	cpuid
	or	%ecx,%r9d
	and	\$0x00000801,%r9d	# isolate AMD XOP bit, 1<<11

	cmp	\$0x80000008,%r10d
	jb	.Lintel

	mov	\$0x80000008,%eax
	cpuid
	movzb	%cl,%r10		# number of cores - 1
	inc	%r10			# number of cores

	mov	\$1,%eax
	cpuid
	bt	\$28,%edx		# test hyper-threading bit
	jnc	.Lgeneric
	shr	\$16,%ebx		# number of logical processors
	cmp	%r10b,%bl
	ja	.Lgeneric
	and	\$0xefffffff,%edx	# ~(1<<28)
	jmp	.Lgeneric

.Lintel:
	cmp	\$4,%r11d
	mov	\$-1,%r10d
	jb	.Lnocacheinfo

	mov	\$4,%eax
	mov	\$0,%ecx		# query L1D
	cpuid
	mov	%eax,%r10d
	shr	\$14,%r10d
	and	\$0xfff,%r10d		# number of cores -1 per L1D

.Lnocacheinfo:
	mov	\$1,%eax
	cpuid
	movd	%eax,%xmm0		# put aside processor id
	and	\$0xbfefffff,%edx	# force reserved bits to 0
	cmp	\$0,%r9d
	jne	.Lnotintel
	or	\$0x40000000,%edx	# set reserved bit#30 on Intel CPUs
	and	\$15,%ah
	cmp	\$15,%ah		# examine Family ID
	jne	.LnotP4
	or	\$0x00100000,%edx	# set reserved bit#20 to engage RC4_CHAR
.LnotP4:
	cmp	\$6,%ah
	jne	.Lnotintel
	and	\$0x0fff0ff0,%eax
	cmp	\$0x00050670,%eax	# Knights Landing
	je	.Lknights
	cmp	\$0x00080650,%eax	# Knights Mill (according to sde)
	jne	.Lnotintel
.Lknights:
	and	\$0xfbffffff,%ecx	# clear XSAVE flag to mimic Silvermont

.Lnotintel:
	bt	\$28,%edx		# test hyper-threading bit
	jnc	.Lgeneric
	and	\$0xefffffff,%edx	# ~(1<<28)
	cmp	\$0,%r10d
	je	.Lgeneric

	or	\$0x10000000,%edx	# 1<<28
	shr	\$16,%ebx
	cmp	\$1,%bl			# see if cache is shared
	ja	.Lgeneric
	and	\$0xefffffff,%edx	# ~(1<<28)
.Lgeneric:
	and	\$0x00000800,%r9d	# isolate AMD XOP flag
	and	\$0xfffff7ff,%ecx
	or	%ecx,%r9d		# merge AMD XOP flag

	mov	%edx,%r10d		# %r9d:%r10d is copy of %ecx:%edx

	cmp	\$7,%r11d
	jb	.Lno_extended_info
	mov	\$7,%eax
	xor	%ecx,%ecx
	cpuid
	bt	\$26,%r9d		# check XSAVE bit, cleared on Knights
	jc	.Lnotknights
	and	\$0xfff7ffff,%ebx	# clear ADCX/ADOX flag
.Lnotknights:
	movd	%xmm0,%eax		# restore processor id
	and	\$0x0fff0ff0,%eax
	cmp	\$0x00050650,%eax	# Skylake-X
	jne	.Lnotskylakex
	and	\$0xfffeffff,%ebx	# ~(1<<16)
					# suppress AVX512F flag on Skylake-X
.Lnotskylakex:
	mov	%ebx,8(%rdi)		# save extended feature flags
	mov	%ecx,12(%rdi)
.Lno_extended_info:

	bt	\$27,%r9d		# check OSXSAVE bit
	jnc	.Lclear_avx
	xor	%ecx,%ecx		# XCR0
	.byte	0x0f,0x01,0xd0		# xgetbv
	and	\$0xe6,%eax		# isolate XMM, YMM and ZMM state support
	cmp	\$0xe6,%eax
	je	.Ldone
	andl	\$0x3fdeffff,8(%rdi)	# ~(1<<31|1<<30|1<<21|1<<16)
					# clear AVX512F+BW+VL+IFMA, all of
					# them are EVEX-encoded, which requires
					# ZMM state support even if one uses
					# only XMM and YMM :-(
	and	\$6,%eax		# isolate XMM and YMM state support
	cmp	\$6,%eax
	je	.Ldone
.Lclear_avx:
	mov	\$0xefffe7ff,%eax	# ~(1<<28|1<<12|1<<11)
	and	%eax,%r9d		# clear AVX, FMA and AMD XOP bits
	mov	\$0x3fdeffdf,%eax	# ~(1<<31|1<<30|1<<21|1<<16|1<<5)
	and	%eax,8(%rdi)		# clear AVX2 and AVX512* bits
.Ldone:
	shl	\$32,%r9
	mov	%r10d,%eax
	mov	%r8,%rbx		# restore %rbx
.cfi_restore	%rbx
	or	%r9,%rax
	ret
.cfi_endproc
.size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid

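# OPENSSL_cleanse(ptr, len): zero 'len' bytes at 'ptr', byte by byte for
# short or unaligned regions and eight bytes at a time once aligned; done in
# assembly so the stores cannot be optimized away.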
.globl  OPENSSL_cleanse
.type   OPENSSL_cleanse,\@abi-omnipotent
.align  16
OPENSSL_cleanse:
.cfi_startproc
	xor	%rax,%rax
	cmp	\$15,$arg2
	jae	.Lot
	cmp	\$0,$arg2
	je	.Lret
.Little:
	mov	%al,($arg1)
	sub	\$1,$arg2
	lea	1($arg1),$arg1
	jnz	.Little
.Lret:
	ret
.align	16
.Lot:
	test	\$7,$arg1
	jz	.Laligned
	mov	%al,($arg1)
	lea	-1($arg2),$arg2
	lea	1($arg1),$arg1
	jmp	.Lot
.Laligned:
	mov	%rax,($arg1)
	lea	-8($arg2),$arg2
	test	\$-8,$arg2
	lea	8($arg1),$arg1
	jnz	.Laligned
	cmp	\$0,$arg2
	jne	.Little
	ret
.cfi_endproc
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

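# CRYPTO_memcmp(a, b, len): constant-time comparison that accumulates the
# XOR of every byte pair; returns 0 if the regions are equal and 1 otherwise,
# with a branch-free fast path when 'len' is exactly 16.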
.globl  CRYPTO_memcmp
.type   CRYPTO_memcmp,\@abi-omnipotent
.align  16
CRYPTO_memcmp:
.cfi_startproc
	xor	%rax,%rax
	xor	%r10,%r10
	cmp	\$0,$arg3
	je	.Lno_data
	cmp	\$16,$arg3
	jne	.Loop_cmp
	mov	($arg1),%r10
	mov	8($arg1),%r11
	mov	\$1,$arg3
	xor	($arg2),%r10
	xor	8($arg2),%r11
	or	%r11,%r10
	cmovnz	$arg3,%rax
	ret

.align	16
.Loop_cmp:
	mov	($arg1),%r10b
	lea	1($arg1),$arg1
	xor	($arg2),%r10b
	lea	1($arg2),$arg2
	or	%r10b,%al
	dec	$arg3
	jnz	.Loop_cmp
	neg	%rax
	shr	\$63,%rax
.Lno_data:
	ret
.cfi_endproc
.size	CRYPTO_memcmp,.-CRYPTO_memcmp
___

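# OPENSSL_wipe_cpu zeroes the registers that are volatile in the active
# calling convention and returns the address just above its return address;
# two variants are emitted because SysV and Win64 disagree on which registers
# a callee may clobber.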
print<<___ if (!$win64);
.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,\@abi-omnipotent
.align	16
OPENSSL_wipe_cpu:
.cfi_startproc
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	xorq	%rcx,%rcx
	xorq	%rdx,%rdx
	xorq	%rsi,%rsi
	xorq	%rdi,%rdi
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	leaq	8(%rsp),%rax
	ret
.cfi_endproc
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
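# Win64 variant: %xmm6-%xmm15, %rdi and %rsi are callee-saved in that ABI,
# so only the volatile subset is cleared.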
print<<___ if ($win64);
.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,\@abi-omnipotent
.align	16
OPENSSL_wipe_cpu:
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	xorq	%rcx,%rcx
	xorq	%rdx,%rdx
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	leaq	8(%rsp),%rax
	ret
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
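# OPENSSL_instrument_bus and OPENSSL_instrument_bus2 sample RDTSC deltas
# around locked additions to a cache line that is flushed on every iteration,
# presumably to measure bus/memory timing jitter.  The register aliases below
# act as a shared "argument frame" for both routines.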
{
my $out="%r10";
my $cnt="%rcx";
my $max="%r11";
my $lasttick="%r8d";
my $lastdiff="%r9d";
my $redzone=$win64?8:-8;	# Win64 home space vs. SysV red zone scratch slot

print<<___;
.globl	OPENSSL_instrument_bus
.type	OPENSSL_instrument_bus,\@abi-omnipotent
.align	16
OPENSSL_instrument_bus:
.cfi_startproc
	mov	$arg1,$out	# tribute to Win64
	mov	$arg2,$cnt
	mov	$arg2,$max

	rdtsc			# collect 1st tick
	mov	%eax,$lasttick	# lasttick = tick
	mov	\$0,$lastdiff	# lastdiff = 0
	clflush	($out)
	.byte	0xf0		# lock
	add	$lastdiff,($out)
	jmp	.Loop
.align	16
.Loop:	rdtsc
	mov	%eax,%edx
	sub	$lasttick,%eax
	mov	%edx,$lasttick
	mov	%eax,$lastdiff
	clflush	($out)
	.byte	0xf0		# lock
	add	%eax,($out)
	lea	4($out),$out
	sub	\$1,$cnt
	jnz	.Loop

	mov	$max,%rax
	ret
.cfi_endproc
.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus

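# OPENSSL_instrument_bus2 is the adaptive variant: each probe accumulates the
# RDTSC delta into the current output slot, but the slot pointer and sample
# count (2nd argument) advance only when the delta differs from the previous
# one; it stops when the requested number of samples has been collected or
# the probe budget in the 3rd argument is exhausted, and returns the number
# of samples collected.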
.globl	OPENSSL_instrument_bus2
.type	OPENSSL_instrument_bus2,\@abi-omnipotent
.align	16
OPENSSL_instrument_bus2:
.cfi_startproc
	mov	$arg1,$out	# tribute to Win64
	mov	$arg2,$cnt
	mov	$arg3,$max
	mov	$cnt,$redzone(%rsp)

	rdtsc			# collect 1st tick
	mov	%eax,$lasttick	# lasttick = tick
	mov	\$0,$lastdiff	# lastdiff = 0

	clflush	($out)
	.byte	0xf0		# lock
	add	$lastdiff,($out)

	rdtsc			# collect 1st diff
	mov	%eax,%edx
	sub	$lasttick,%eax	# diff
	mov	%edx,$lasttick	# lasttick = tick
	mov	%eax,$lastdiff	# lastdiff = diff
.Loop2:
	clflush	($out)
	.byte	0xf0		# lock
	add	%eax,($out)	# accumulate diff

	sub	\$1,$max
	jz	.Ldone2

	rdtsc
	mov	%eax,%edx
	sub	$lasttick,%eax	# diff
	mov	%edx,$lasttick	# lasttick = tick
	cmp	$lastdiff,%eax
	mov	%eax,$lastdiff	# lastdiff = diff
	mov	\$0,%edx
	setne	%dl
	sub	%rdx,$cnt	# conditional --$cnt
	lea	($out,%rdx,4),$out	# conditional ++$out
	jnz	.Loop2

.Ldone2:
	mov	$redzone(%rsp),%rax
	sub	$cnt,%rax
	ret
.cfi_endproc
.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
___
}

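# gen_random() emits OPENSSL_ia32_rdrand_bytes and OPENSSL_ia32_rdseed_bytes:
# each fills the caller's buffer using the corresponding instruction, retrying
# a failing instruction up to 8 times before giving up, and returns the number
# of bytes actually stored (possibly fewer than requested).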
sub gen_random {
my $rdop = shift;
print<<___;
.globl	OPENSSL_ia32_${rdop}_bytes
.type	OPENSSL_ia32_${rdop}_bytes,\@abi-omnipotent
.align	16
OPENSSL_ia32_${rdop}_bytes:
.cfi_startproc
	xor	%rax, %rax	# return value
	cmp	\$0,$arg2
	je	.Ldone_${rdop}_bytes

	mov	\$8,%r11
.Loop_${rdop}_bytes:
	${rdop}	%r10
	jc	.Lbreak_${rdop}_bytes
	dec	%r11
	jnz	.Loop_${rdop}_bytes
	jmp	.Ldone_${rdop}_bytes

.align	16
.Lbreak_${rdop}_bytes:
	cmp	\$8,$arg2
	jb	.Ltail_${rdop}_bytes
	mov	%r10,($arg1)
	lea	8($arg1),$arg1
	add	\$8,%rax
	sub	\$8,$arg2
	jz	.Ldone_${rdop}_bytes
	mov	\$8,%r11
	jmp	.Loop_${rdop}_bytes

.align	16
.Ltail_${rdop}_bytes:
	mov	%r10b,($arg1)
	lea	1($arg1),$arg1
	inc	%rax
	shr	\$8,%r10
	dec	$arg2
	jnz	.Ltail_${rdop}_bytes

.Ldone_${rdop}_bytes:
	xor	%r10,%r10	# Clear sensitive data from register
	ret
.cfi_endproc
.size	OPENSSL_ia32_${rdop}_bytes,.-OPENSSL_ia32_${rdop}_bytes
___
}
gen_random("rdrand");
gen_random("rdseed");

close STDOUT or die "error closing STDOUT: $!";	# flush