#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane-complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. However,
# instead of actually unrolling the loop pair-wise, the pointers to T[][]
# and A[][] are simply flipped at the end of each round. Since the number
# of rounds is even, the last round writes to A[][] and everything works
# out. How does it compare to the x86_64 assembly module in the Keccak
# Code Package? Depending on the processor it's either as fast, or faster
# by up to 15%...
#
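# As a rough sketch (in the spirit of the KECCAK_2X reference code in
# sha/keccak1600.c, where Round() computes one round from A[][] into
# T[][]; names here are illustrative), the permutation amounts to
#
#	for (i = 0; i < 24; i++) {
#		Round(T, A, i);
#		SWAP(A, T);		/* flip pointers instead of
#					 * emitting a second round body */
#	}
#
# so with an even round count the result lands back in A[][]. In the
# code below the flip is the "xchg %rsi,%rdi" at the end of each round.
#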
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. The improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in comparison
#	to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
#	improved by 14% by replacing the rotates with double-precision
#	shifts using the same register as source and destination.
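#
#	For reference, the substitutions suggested above (and spelled out
#	in the commented-out regexes at the bottom of this file) would,
#	e.g., turn
#
#		rol	$14,%rbp
#	into
#		shld	$14,%rbp,%rbp		# Sandy Bridge-friendly
#	or
#		rorx	$50,%rbp,%rbp		# 64-14; requires BMI2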

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
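# The -100 bias (matching the "lea 100(%rdi),%rdi" size optimizations
# below) keeps every lane offset within [-100,92], i.e. in signed 8-bit
# displacement range, which shortens the memory-operand encodings.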

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
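# Rho rotation constants, laid out to match the A[][] indexing above;
# apparently the same table as rhotates[][] in sha/keccak1600.c.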

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
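	# Internal calling convention: %rdi points 100 bytes into the current
	# A[][] and %rsi 100 bytes into the scratch T[][] (hence the -100
	# bias in the precomputed offsets), with $iotas at the round-constant
	# table. A[][] is expected in lane-complemented form, and since the
	# round count is even, %rdi and %rsi end up back in place on return.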
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],         @C[0]
	xor	$A[0][1](%rdi),@C[1]
	 xor	$A[1][2](%rdi),@C[2]
	 xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],         @C[2]
	xor	$A[2][0](%rdi),@C[0]
	 xor	$A[1][3](%rdi),@C[3]
	 xor	@D[1],         @C[1]
	 xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	 xor	$A[2][3](%rdi),@C[3]
	 xor	$A[2][1](%rdi),@C[1]
	 xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	 xor	@D[3],         @C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	 xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	 xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
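	# Rename the Perl-level register sets: the theta values D[0..4] just
	# computed (in @C[1..4] and @C[0]) become @D, while the old @D
	# registers, holding the diagonal lanes A[0][0], A[1][1], A[2][2],
	# A[3][3] and A[4][4], become the new @C working set.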
	(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	 mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	 or	@C[2],@C[1]
	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	 xor	($iotas),@C[1]
	 lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	  mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	  mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	  mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	  mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	 mov	@C[0],@T[0]
	 or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	  mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	or	@C[3],@C[4]
	  mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	  mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])

	or	@C[2],@T[1]
	  mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
	  mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	 mov	@C[2],@T[0]
	 and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	  mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	  mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	  mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	 mov	@C[2],@T[0]
	 or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
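	# Only the @D registers matter at this point: they hold the five
	# rotated lanes feeding the last row, so re-label them as @C in the
	# operand order chi expects for R[4][*] below.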
	@C = @D[2..4,0,1];
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

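	# iotas is 256-byte aligned and preceded by 64 bytes of zero padding,
	# so after 24 rounds of "lea 8($iotas)" the pointer's low byte wraps
	# to zero; testing its low 8 bits thus doubles as the round counter.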
	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

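	# Enter the lane-complemented representation: these six lanes are
	# the ones the complementing transform keeps bitwise-inverted (cf.
	# sha/keccak1600.c); they are flipped back below before returning.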
	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
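# The C prototype (cf. sha/keccak1600.c) is:
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#			   size_t len, size_t r);
# It absorbs whole r-byte blocks and returns the residual byte count.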
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

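	# Only complete blocks are absorbed; a sub-block tail is left in the
	# input buffer and its length is returned in %rax at .Ldone_absorb.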
.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
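# The C prototype (cf. sha/keccak1600.c) is:
#	void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#			  size_t len, size_t r);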
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze
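	# Note that the public KeccakF1600 is called here, not __KeccakF1600,
	# so the state in memory stays in its canonical (un-complemented)
	# form between output blocks.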

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8, %rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep	movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
	# Haswell, but it hurts other processors by up to 2-3-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# Below replacement results in 9.3 on Haswell [as well as
	# on Ryzen, i.e. it *hurts* Ryzen]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT;