xref: /original-bsd/sys/hp300/hp300/oc_cksum.s (revision 333da485)
1/*
2| Copyright (c) 1988, 1993
3|	The Regents of the University of California.  All rights reserved.
4|
5| %sccs.include.redist.gas%
6|
7|	@(#)oc_cksum.s	8.3 (Berkeley) 01/21/94
8|
9|
10| oc_cksum: ones complement 16 bit checksum for MC68020.
11|
12| oc_cksum (buffer, count, strtval)
13|
14| Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
15| 'strtval' is the starting value of the sum (usually zero).
16|
17| It simplifies life in in_cksum if strtval can be >= 2^16.
18| This routine will work as long as strtval is < 2^31.
19|
20| Performance
21| -----------
22| This routine is intended for MC 68020s but should also work
23| for 68030s.  It (deliberately) doesn't worry about the alignment
24| of the buffer so will only work on a 68010 if the buffer is
25| aligned on an even address.  (Also, a routine written to use
26| 68010 "loop mode" would almost certainly be faster than this
27| code on a 68010).
28|
29| We don't worry about alignment because this routine is frequently
30| called with small counts: 20 bytes for IP header checksums and 40
31| bytes for TCP ack checksums.  For these small counts, testing for
32| bad alignment adds ~10% to the per-call cost.  Since, by the nature
33| of the kernel's allocator, the data we're called with is almost
34| always longword aligned, there is no benefit to this added cost
35| and we're better off letting the loop take a big performance hit
36| in the rare cases where we're handed an unaligned buffer.
37|
38| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
39| tested on random data on four different types of processors (see
40| list below -- 64 was the largest unrolling because anything more
41| overflows the 68020 Icache).  On all the processors, the
42| throughput asymptote was located between 8 and 16 (closer to 8).
43| However, 16 was substantially better than 8 for small counts.
44| (It's clear why this happens for a count of 40: unroll-8 pays a
45| loop branch cost and unroll-16 doesn't.  But the tests also showed
46| that 16 was better than 8 for a count of 20.  It's not obvious to
47| me why.)  So, since 16 was good for both large and small counts,
48| the loop below is unrolled 16 times.
49|
50| The processors tested and their average time to checksum 1024 bytes
51| of random data were:
52| 	Sun 3/50 (15MHz)	190 us/KB
53| 	Sun 3/180 (16.6MHz)	175 us/KB
54| 	Sun 3/60 (20MHz)	134 us/KB
55| 	Sun 3/280 (25MHz)	 95 us/KB
56|
57| The cost of calling this routine was typically 10% of the per-
58| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
59| and each additional byte cost 125ns.  With the high fixed cost,
60| it would clearly be a gain to "inline" this routine -- the
61| subroutine call adds 400% overhead to an IP header checksum.
62| However, in absolute terms, inlining would only gain 10us per
63| packet -- a 1% effect for a 1ms ethernet packet.  This is not
64| enough gain to be worth the effort.
65*/
66
	.data
| Embed the SCCS/RCS version string in the object file's data section
| (the "@(#)" prefix is what what(1) searches for).
	.asciz	"@(#)$Header: oc_cksum.s,v 1.1 90/07/09 16:04:43 mike Exp $"
	.even			| realign after the odd-length string
	.text
	.globl	_oc_cksum	| exported entry point (C name: oc_cksum)
|
| _oc_cksum(buffer, count, strtval):
|	16-bit ones-complement checksum of 'count' bytes at 'buffer',
|	seeded with 'strtval' (may be >= 2^16 but must be < 2^31; see
|	header comment above).
|
| In:	sp@(4)	buffer pointer	-> a0
|	sp@(8)	byte count	-> d1
|	sp@(12)	starting value	-> d0
| Out:	d0 = folded 16-bit sum (0..0xffff)
| Clob:	d1, a0, condition codes; d2 is saved and restored on the stack.
|
_oc_cksum:
	movl	sp@(4),a0	| get buffer ptr
	movl	sp@(8),d1	| get byte count
	movl	sp@(12),d0	| get starting value
	movl	d2,sp@-		| free a reg

	| test for possible 1, 2 or 3 bytes of excess at end
	| of buffer.  The usual case is no excess (the usual
	| case is header checksums) so we give that the faster
	| 'not taken' leg of the compare.  (We do the excess
	| first because we are about to trash the low order
	| bits of the count in d1.)

	btst	#0,d1
	jne	L5		| if one or three bytes excess
	btst	#1,d1
	jne	L7		| if two bytes excess
L1:
	| Dispatch into the 16x unrolled loop: d1 becomes the number of
	| full 64-byte chunks; d2 becomes the negated byte offset from L3
	| back into the loop body, so the leftover longwords of a partial
	| chunk get summed on the first (shortened) pass.
	movl	d1,d2
	lsrl	#6,d1		| make cnt into # of 64 byte chunks
	andl	#0x3c,d2	| then find fractions of a chunk
	negl	d2
	andb	#0xf,cc		| clear X (so the first addxl acts as a plain add)
	jmp	pc@(L3-.-2:b,d2)
	| (each movl/addxl pair below is 4 bytes of code summing 4 bytes
	|  of data, so the negative offset in d2 lands exactly on the
	|  right entry point within the unrolled loop)
L2:
	| sum 64 bytes per full iteration; addxl folds each carry back in
	| through the X bit, which is preserved across movl and dbra
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
L3:
	dbra	d1,L2		| (NB- dbra does not affect X)

	movl	d0,d1		| fold 32 bit sum to 16 bits
	swap	d1		| (NB- swap does not affect X)
	addxw	d1,d0		| add high + low halves (plus any pending X bit)
	jcc	L4
	addw	#1,d0		| end-around carry: wrap carry out of the fold
L4:
	andl	#0xffff,d0	| result is the low 16 bits only
	movl	sp@+,d2		| restore caller's d2
	rts

L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
	btst	#1,d1
	jeq	L6		| if 1 excess

	| 3 bytes excess
	clrl	d2
	movw	a0@(-3,d1:l),d2	| add in last full word then drop
	addl	d2,d0		|  through to pick up last byte

L6:	| 1 byte excess
	clrl	d2
	movb	a0@(-1,d1:l),d2
	lsll	#8,d2		| odd byte is the high half of its 16-bit
				|  word (68k is big-endian)
	addl	d2,d0
	jra	L1

L7:	| 2 bytes excess
	clrl	d2
	movw	a0@(-2,d1:l),d2
	addl	d2,d0
	jra	L1

164