xref: /netbsd/sys/arch/mvme68k/stand/sboot/oc_cksum.s (revision 6550d01e)
1|	$NetBSD: oc_cksum.s,v 1.2 2000/11/30 22:26:27 scw Exp $
2
3| Copyright (c) 1988 Regents of the University of California.
4| All rights reserved.
5|
6| Redistribution and use in source and binary forms, with or without
7| modification, are permitted provided that the following conditions
8| are met:
9| 1. Redistributions of source code must retain the above copyright
10|    notice, this list of conditions and the following disclaimer.
11| 2. Redistributions in binary form must reproduce the above copyright
12|    notice, this list of conditions and the following disclaimer in the
13|    documentation and/or other materials provided with the distribution.
14| 3. All advertising materials mentioning features or use of this software
15|    must display the following acknowledgement:
16|	This product includes software developed by the University of
17|	California, Berkeley and its contributors.
18| 4. Neither the name of the University nor the names of its contributors
19|    may be used to endorse or promote products derived from this software
20|    without specific prior written permission.
21|
22| THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25| ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28| OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32| SUCH DAMAGE.
33|
34|	@(#)oc_cksum.s	7.2 (Berkeley) 11/3/90
35|
36|
37| oc_cksum: one's complement 16 bit checksum for MC68020.
38|
39| oc_cksum (buffer, count, strtval)
40|
41| Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
42| 'strtval' is the starting value of the sum (usually zero).
43|
44| It simplifies life in in_cksum if strtval can be >= 2^16.
45| This routine will work as long as strtval is < 2^31.
46|
47| Performance
48| -----------
49| This routine is intended for MC 68020s but should also work
50| for 68030s.  It (deliberately) doesn't worry about the alignment
51| of the buffer so will only work on a 68010 if the buffer is
52| aligned on an even address.  (Also, a routine written to use
53| 68010 "loop mode" would almost certainly be faster than this
54| code on a 68010).
55|
56| We don't worry about alignment because this routine is frequently
57| called with small counts: 20 bytes for IP header checksums and 40
58| bytes for TCP ack checksums.  For these small counts, testing for
59| bad alignment adds ~10% to the per-call cost.  Since, by the nature
60| of the kernel's allocator, the data we're called with is almost
61| always longword aligned, there is no benefit to this added cost
62| and we're better off letting the loop take a big performance hit
63| in the rare cases where we're handed an unaligned buffer.
64|
65| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
66| tested on random data on four different types of processors (see
67| list below -- 64 was the largest unrolling because anything more
68| overflows the 68020 Icache).  On all the processors, the
69| throughput asymptote was located between 8 and 16 (closer to 8).
70| However, 16 was substantially better than 8 for small counts.
71| (It's clear why this happens for a count of 40: unroll-8 pays a
72| loop branch cost and unroll-16 doesn't.  But the tests also showed
73| that 16 was better than 8 for a count of 20.  It's not obvious to
74| me why.)  So, since 16 was good for both large and small counts,
75| the loop below is unrolled 16 times.
76|
77| The processors tested and their average time to checksum 1024 bytes
78| of random data were:
79| 	Sun 3/50 (15MHz)	190 us/KB
80| 	Sun 3/180 (16.6MHz)	175 us/KB
81| 	Sun 3/60 (20MHz)	134 us/KB
82| 	Sun 3/280 (25MHz)	 95 us/KB
83|
84| The cost of calling this routine was typically 10% of the per-
85| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
86| and each additional byte cost 125ns.  With the high fixed cost,
87| it would clearly be a gain to "inline" this routine -- the
88| subroutine call adds 400% overhead to an IP header checksum.
89| However, in absolute terms, inlining would only gain 10us per
90| packet -- a 1% effect for a 1ms ethernet packet.  This is not
91| enough gain to be worth the effort.
92
93#include <m68k/asm.h>
94
95	.text
96	.even			| keep the entry point on an even address
97
| oc_cksum(buffer, count, strtval)
|
| Arguments are read from the stack: buffer at sp@(4), count at
| sp@(8), strtval at sp@(12).  Register use inside the routine:
|	a0 = buffer pointer (advanced by the summing loop)
|	d0 = running 32 bit sum; the folded 16 bit result on return
|	d1 = byte count, later the 64 byte chunk counter for dbra
|	d2 = scratch; the caller's value is pushed at entry and
|	     popped again just before rts
| The X (extend) bit carries the end-around carry from one addxl to
| the next, so nothing between the 'clear X' below and the final
| addxw may modify X.
98ENTRY_NOPROFILE(oc_cksum)
99	movl	%sp@(4),%a0	| get buffer ptr
100	movl	%sp@(8),%d1	| get byte count
101	movl	%sp@(12),%d0	| get starting value
102	movl	%d2,%sp@-	| free a reg (d2 is restored before rts)
103
104	| test for possible 1, 2 or 3 bytes of excess at end
105	| of buffer.  The usual case is no excess (the usual
106	| case is header checksums) so we give that the faster
107	| 'not taken' leg of the compare.  (We do the excess
108	| first because we're about to trash the low order
109	| bits of the count in d1.)
110
111	btst	#0,%d1
112	jne	L5		| if one or three bytes excess
113	btst	#1,%d1
114	jne	L7		| if two bytes excess
115L1:	| sum the whole longwords; the excess paths rejoin here
116	movl	%d1,%d2
117	lsrl	#6,%d1		| make cnt into # of 64 byte chunks
118	andl	#0x3c,%d2	| then find fractions of a chunk
119	negl	%d2		| negative offset back from L3 (negl changes X)
120	andb	#0xf,%ccr		| clear X
	| Enter the unrolled loop part way through: land d2 bytes
	| before L3.  This relies on each movl/addxl pair below
	| occupying as many code bytes as the data bytes it sums
	| (four), so the leftover longwords are added on the first
	| pass and each dbra iteration then adds a full 64 bytes.
121	jmp	%pc@(L3-.-2:b,%d2)
122L2:	| 16 x (load next longword, add it with extend)
123	movl	%a0@+,%d2
124	addxl	%d2,%d0
125	movl	%a0@+,%d2
126	addxl	%d2,%d0
127	movl	%a0@+,%d2
128	addxl	%d2,%d0
129	movl	%a0@+,%d2
130	addxl	%d2,%d0
131	movl	%a0@+,%d2
132	addxl	%d2,%d0
133	movl	%a0@+,%d2
134	addxl	%d2,%d0
135	movl	%a0@+,%d2
136	addxl	%d2,%d0
137	movl	%a0@+,%d2
138	addxl	%d2,%d0
139	movl	%a0@+,%d2
140	addxl	%d2,%d0
141	movl	%a0@+,%d2
142	addxl	%d2,%d0
143	movl	%a0@+,%d2
144	addxl	%d2,%d0
145	movl	%a0@+,%d2
146	addxl	%d2,%d0
147	movl	%a0@+,%d2
148	addxl	%d2,%d0
149	movl	%a0@+,%d2
150	addxl	%d2,%d0
151	movl	%a0@+,%d2
152	addxl	%d2,%d0
153	movl	%a0@+,%d2
154	addxl	%d2,%d0
155L3:
156	dbra	%d1,L2		| (NB- dbra doesn't affect X)
	| NOTE(review): dbra counts in the low word of d1 only, so a
	| count of 4MB or more would not be summed completely --
	| confirm no caller passes one.
157
158	movl	%d0,%d1		| fold 32 bit sum to 16 bits
159	swap	%d1		| (NB- swap doesn't affect X)
160	addxw	%d1,%d0		| low word += high word + pending X
161	jcc	L4
162	addw	#1,%d0		| end-around carry from the fold
163L4:
164	andl	#0xffff,%d0	| return only the 16 bit sum
165	movl	%sp@+,%d2	| restore caller's d2
166	rts
167
	| The excess paths below add with plain addl, so a carry out
	| of bit 31 here would be lost; presumably this is covered by
	| the strtval < 2^31 requirement -- TODO confirm.
168L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
169	btst	#1,%d1
170	jeq	L6		| if 1 excess
171
172	| 3 bytes excess
173	clrl	%d2
174	movw	%a0@(-3,%d1:l),%d2	| add in last full word then drop
175	addl	%d2,%d0		|  through to pick up last byte
176
177L6:	| 1 byte excess
178	clrl	%d2
179	movb	%a0@(-1,%d1:l),%d2	| last byte of the buffer
180	lsll	#8,%d2		| make it the high byte of a 16 bit word
181	addl	%d2,%d0
182	jra	L1
183
184L7:	| 2 bytes excess
185	clrl	%d2
186	movw	%a0@(-2,%d1:l),%d2	| last word of the buffer
187	addl	%d2,%d0
188	jra	L1
188	jra	L1
189