/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.

/ ** ARCFOUR implementation optimized for AMD64.
/ **
/ ** The throughput achieved by this code is about 320 MBytes/sec, on
/ ** a 1.8 GHz AMD Opteron (rev C0) processor.

/ ----------------------------------------------------------------------
/ ARCFOUR: combine len bytes read from in with the RC4 keystream, write
/ the result to out, and store the updated x and y back into the key.
/
/ Arguments on entry, as read by the code below:
/	%rdi = key	state block: x at offset 0, y at offset 8, table d
/			at offset 16 with one entry every 8 bytes
/	%rsi = len	number of bytes to process
/	%rdx = in	source buffer
/	%rcx = out	destination buffer
/
/ Register roles inside the routine:
/	%rbp = d (key + 16)	%rcx = x		%rdx = y
/	%rax = tx = d[x]	%rbx = ty, then the index ty + tx
/	%rsi = in cursor	%rdi = out cursor
/	%r9  = in+len-8 during the 8-byte loop, in+len during the tail
/	%r8  = keystream accumulator	%r11 = per-word byte counter
/
/ %rbp and %rbx are saved and restored.  %rax, %rcx, %rdx, %rsi, %rdi,
/ %r8, %r9, %r11 and the flags are overwritten.  No return value is set
/ up explicitly.
/
/ x always holds the index of the NEXT round (it is incremented ahead of
/ use), so it is decremented once before being stored on exit.
/
/ y is loaded as a 64-bit value but only its low byte is updated and
/ stored, and table entries are loaded as 32-bit values and used as
/ indices after an 8-bit add.  The upper bytes of key->y and of every
/ table entry are therefore assumed to be zero.
/ NOTE(review): confirm against the code that initialises the key.
/
/ The buffer-bound comparisons are signed on purpose: when len < 8 the
/ value in+len-8 may wrap below zero (for example in == 0, len == 0) and
/ must still compare as less than in.
/ ----------------------------------------------------------------------
.text
.align 16
.globl ARCFOUR
.type ARCFOUR,@function
ARCFOUR:
	pushq	%rbp			/ callee-saved, reused as d
	pushq	%rbx			/ callee-saved, reused as ty / val
	movq	%rdi,		%rbp	/ key = ARG(key)
	movq	%rsi,		%rbx	/ rbx = ARG(len)
	movq	%rdx,		%rsi	/ in = ARG(in)
	movq	%rcx,		%rdi	/ out = ARG(out)
	movq	(%rbp),		%rcx	/ x = key->x
	movq	8(%rbp),	%rdx	/ y = key->y
	addq	$16,		%rbp	/ d = key->data
	incq	%rcx			/ x++
	andq	$255,		%rcx	/ x &= 0xff
	leaq	-8(%rbx,%rsi),	%rbx	/ rbx = in+len-8
	movq	%rbx,		%r9	/ tmp = in+len-8 (last 8-byte start)
	movq	0(%rbp,%rcx,8),	%rax	/ tx = d[x]
	cmpq	%rsi,		%rbx	/ cmp in with in+len-8 (signed)
	jl	.Lend			/ fewer than 8 bytes: tail loop only

.Lstart:
	addq	$8,		%rsi		/ increment in
	addq	$8,		%rdi		/ increment out

	/ generate the next 8 bytes of the rc4 stream into %r8
	movq	$8,		%r11		/ byte counter
.Lnext_byte:
	addb	%al,		%dl		/ y += tx
	movl	0(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	/ d[x] = ty
	addb	%al,		%bl		/ val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	/ d[y] = tx
	incb	%cl				/ x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		/ val = d[val], into low byte
	decb	%r11b				/ sets ZF on the last byte
	rorq	$8,		%r8		/ next slot (ror keeps ZF)
	jnz	.Lnext_byte			/ 8 rotates restore byte order

	/ xor 8 bytes
	xorq	-8(%rsi),	%r8		/ in was already advanced by 8
	cmpq	%r9,		%rsi		/ cmp in+len-8 with in (signed)
	movq	%r8,		-8(%rdi)	/ mov leaves the flags intact
	jle	.Lstart				/ jump if (in <= in+len-8)

.Lend:
	addq	$8,		%r9		/ tmp = in+len

	/ handle the last bytes, one by one
.Ltail:
	cmpq	%rsi,		%r9		/ cmp in with in+len (signed)
	jle	.Lfinished			/ jump if (in+len <= in)
	addb	%al,		%dl		/ y += tx
	movl	0(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	/ d[x] = ty
	addb	%al,		%bl		/ val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	/ d[y] = tx
	incb	%cl				/ x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		/ val = d[val]
	xorb	(%rsi),		%r8b		/ xor 1 byte
	movb	%r8b,		(%rdi)
	incq	%rsi				/ in++
	incq	%rdi				/ out++
	jmp	.Ltail

.Lfinished:
	decq	%rcx				/ x-- (undo the look-ahead)
	movb	%dl,		-8(%rbp)	/ key->y = y (low byte only)
	movb	%cl,		-16(%rbp)	/ key->x = x (low byte only)
	popq	%rbx
	popq	%rbp
	ret
.L_ARCFOUR_end:
.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR