#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane-complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Instead
# of actually unrolling the loop pair-wise, it simply flips the pointers
# to T[][] and A[][] at the end of each round. Since the number of rounds
# is even, the last round writes to A[][] and everything works out.
# How does it compare to the x86_64 assembly module in the Keccak Code
# Package? Depending on the processor it's either as fast or up to 15%
# faster...
#
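# A minimal sketch of that pointer flip (illustrative Perl only;
# keccak_round() and the state arrays are hypothetical stand-ins, not
# part of this module):
#
#	my ($src, $dst) = (\@state_A, \@state_T);
#	for (1..24) {
#		keccak_round($dst, $src);	# read $src, write $dst
#		($src, $dst) = ($dst, $src);	# flip instead of unrolling
#	}
#	# 24 is even, so the last write lands back in @state_A.
#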
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. The improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in comparison
#	to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
#	improved by 14% by replacing rotates with double-precision shifts
#	with the same register as source and destination.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
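# The -100 bias keeps every lane displacement within [-100,92], i.e. in
# signed 8-bit range, so each memory operand encodes in a single byte;
# this is what the "size optimization" %rdi/%rsi adjustments below rely
# on. A minimal sanity check (an illustrative addition, not part of the
# original module):
for my $row (@A) {
    for my $off (@$row) {
        die "offset $off exceeds disp8 range" if $off < -128 || $off > 127;
    }
}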

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
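# rho rotation amounts, laid out exactly like the rhotates[][] table in
# sha/keccak1600.c.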

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
.cfi_startproc
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
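	# At the top of each round @C holds row 4 of the state (loaded at
	# function entry, or left there by the previous round); rows 0-3
	# are folded in below to form the theta column parities C[0..4].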
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],         @C[0]
	xor	$A[0][1](%rdi),@C[1]
	 xor	$A[1][2](%rdi),@C[2]
	 xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],         @C[2]
	xor	$A[2][0](%rdi),@C[0]
	 xor	$A[1][3](%rdi),@C[3]
	 xor	@D[1],         @C[1]
	 xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	 xor	$A[2][3](%rdi),@C[3]
	 xor	$A[2][1](%rdi),@C[1]
	 xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	 xor	@D[3],         @C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	 xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	 xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	(@D[0..4], @C) = (@C[1..4,0], @D);
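# Zero-cost renaming at the Perl level: the registers just computed carry
# D[0..4] in order, and the diagonal lanes A[0][0],A[1][1],A[2][2],A[3][3],
# A[4][4] saved in @D become the new C[]; no move instructions are emitted.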
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	 mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	 or	@C[2],@C[1]
	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	 xor	($iotas),@C[1]
	 lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	  mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	  mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	  mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	  mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


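	# Output row 1: the lanes gathered above (interleaved with the
	# chi of row 0) are theta-ed with D[] and rho-rotated below, then
	# combined with chi; the gather pattern implements pi implicitly.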
	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	 mov	@C[0],@T[0]
	 or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	  mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	or	@C[3],@C[4]
	  mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	  mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])

	or	@C[2],@T[1]
	  mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
	  mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	 mov	@C[2],@T[0]
	 and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	  mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	  mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	  mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	 mov	@C[2],@T[0]
	 or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
	@C = @D[2..4,0,1];
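# The xchg above is the T[][]/A[][] pointer flip described in the preamble;
# here the rho-rotated lanes held in @D are reindexed as the C[] operands
# for the final row's chi step below, again at zero instruction cost.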
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

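	# iotas sits 64 bytes into a 256-byte-aligned block, so after 24
	# rounds of 8-byte advances (64+24*8=256) the pointer's low byte
	# turns zero exactly when the last round is done.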
	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.cfi_endproc
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

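	# Switch A[][] to the lane-complemented convention: these six
	# lanes are kept bitwise-complemented inside __KeccakF1600 so
	# that chi needs fewer NOT instructions (see sha/keccak1600.c).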
	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

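	# Absorb loop: while len >= bsz, XOR one rate-sized block into
	# the state eight bytes at a time and permute; leftover bytes
	# (fewer than bsz) are left in the input and their count is
	# returned to the caller.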
.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

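	# Squeeze loop: copy out eight bytes at a time; once a full rate
	# has been emitted and more output is still needed, permute via
	# KeccakF1600 (the wrapper, so the state stays in normal form)
	# and continue. A final partial word goes through rep movsb.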
.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8, %rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep	movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
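	# The 64 bytes of zero padding above place iotas at offset 64
	# within a 256-byte-aligned block, so iotas+24*8 lands exactly on
	# the next 256-byte boundary; that is what the "test \$255" round
	# check in __KeccakF1600 keys off, and what "lea -192" rewinds.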
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# The replacement below yields 11.2 cpb on Sandy Bridge and 9.4 on
	# Haswell, but hurts other processors by up to 2-3-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The replacement below yields 9.3 cpb on Haswell [and also on
	# Ryzen, i.e. it *hurts* Ryzen, whose baseline is 8.8]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT or die "error closing STDOUT: $!";