/*
| Copyright (c) 1988, 1993
|	The Regents of the University of California.  All rights reserved.
|
| %sccs.include.redist.gas%
|
|	@(#)oc_cksum.s	8.3 (Berkeley) 01/21/94
|
|
| oc_cksum: ones complement 16 bit checksum for MC68020.
|
| oc_cksum (buffer, count, strtval)
|
| Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
| 'strtval' is the starting value of the sum (usually zero).
|
| It simplifies life in in_cksum if strtval can be >= 2^16.
| This routine will work as long as strtval is < 2^31.
|
| Performance
| -----------
| This routine is intended for MC 68020s but should also work
| for 68030s.  It (deliberately) doesn't worry about the alignment
| of the buffer so will only work on a 68010 if the buffer is
| aligned on an even address.  (Also, a routine written to use
| 68010 "loop mode" would almost certainly be faster than this
| code on a 68010).
|
| We don't worry about alignment because this routine is frequently
| called with small counts: 20 bytes for IP header checksums and 40
| bytes for TCP ack checksums.  For these small counts, testing for
| bad alignment adds ~10% to the per-call cost.  Since, by the nature
| of the kernel's allocator, the data we're called with is almost
| always longword aligned, there is no benefit to this added cost
| and we're better off letting the loop take a big performance hit
| in the rare cases where we're handed an unaligned buffer.
|
| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
| tested on random data on four different types of processors (see
| list below -- 64 was the largest unrolling because anything more
| overflows the 68020 Icache).  On all the processors, the
| throughput asymptote was located between 8 and 16 (closer to 8).
| However, 16 was substantially better than 8 for small counts.
| (It's clear why this happens for a count of 40: unroll-8 pays a
| loop branch cost and unroll-16 doesn't.
| But the tests also showed
| that 16 was better than 8 for a count of 20.  It's not obvious to
| me why.)  So, since 16 was good for both large and small counts,
| the loop below is unrolled 16 times.
|
| The processors tested and their average time to checksum 1024 bytes
| of random data were:
|	Sun 3/50 (15MHz)	190 us/KB
|	Sun 3/180 (16.6MHz)	175 us/KB
|	Sun 3/60 (20MHz)	134 us/KB
|	Sun 3/280 (25MHz)	 95 us/KB
|
| The cost of calling this routine was typically 10% of the per-
| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
| and each additional byte cost 125ns.  With the high fixed cost,
| it would clearly be a gain to "inline" this routine -- the
| subroutine call adds 400% overhead to an IP header checksum.
| However, in absolute terms, inlining would only gain 10us per
| packet -- a 1% effect for a 1ms ethernet packet.  This is not
| enough gain to be worth the effort.
*/

	.data
	.asciz	"@(#)$Header: oc_cksum.s,v 1.1 90/07/09 16:04:43 mike Exp $"
	.even
	.text

	| u_int oc_cksum(u_char *buffer, int count, u_int strtval)
	|
	| C calling convention, args on the stack:
	|   sp@(4)  = buffer   (need not be aligned, see header comment)
	|   sp@(8)  = count    (byte count; any value >= 0)
	|   sp@(12) = strtval  (starting sum, must be < 2^31)
	| Returns the folded 16 bit one's complement sum in d0.
	| Clobbers d0, d1, a0 and the condition codes; d2 is
	| saved/restored on the stack below.
	.globl	_oc_cksum
_oc_cksum:
	movl	sp@(4),a0	| get buffer ptr
	movl	sp@(8),d1	| get byte count
	movl	sp@(12),d0	| get starting value
	movl	d2,sp@-		| free a reg (d2 is scratch throughout)

	| test for possible 1, 2 or 3 bytes of excess at end
	| of buffer.  The usual case is no excess (the usual
	| case is header checksums) so we give that the faster
	| 'not taken' leg of the compare.  (We do the excess
	| first because we are about to trash the low order
	| bits of the count in d1.)

	btst	#0,d1
	jne	L5		| if one or three bytes excess
	btst	#1,d1
	jne	L7		| if two bytes excess
L1:
	| count is now a multiple of 4.  Split it into whole 64 byte
	| chunks (loop iterations, in d1) and the leftover fraction
	| of a chunk (in d2, as a multiple of 4 in 0..60).
	movl	d1,d2
	lsrl	#6,d1		| make cnt into # of 64 byte chunks
	andl	#0x3c,d2	| then find fractions of a chunk
	negl	d2
	andb	#0xf,cc		| clear X (bit 4 of CCR) so the first
				| addxl below adds no stale carry
	| Computed entry into the unrolled loop: each movl/addxl pair
	| below is 4 bytes of code and sums 4 bytes of data, so jumping
	| (frac) bytes back from L3 sums exactly the leftover fraction
	| on the first pass; dbra then runs the full 64 byte chunks.
	jmp	pc@(L3-.-2:b,d2)
L2:
	| 16x unrolled body: one's complement sum via addxl, which
	| folds the carry (X) from the previous add back in.  The X
	| flag must therefore survive from one addxl to the next --
	| movl does not disturb it.
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
L3:
	dbra	d1,L2		| (NB- dbra does not affect X)

	| Fold the 32 bit sum to 16 bits: add the two halves together
	| (plus the carry still pending in X), then wrap any final
	| carry back into the low word.
	movl	d0,d1		| fold 32 bit sum to 16 bits
	swap	d1		| (NB- swap does not affect X)
	addxw	d1,d0
	jcc	L4
	addw	#1,d0		| end-around carry
L4:
	andl	#0xffff,d0	| return value: 16 bit sum in d0
	movl	sp@+,d2		| restore caller's d2
	rts

L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
	btst	#1,d1
	jeq	L6		| if 1 excess

	| 3 bytes excess
	clrl	d2
	movw	a0@(-3,d1:l),d2	| add in last full word then drop
	addl	d2,d0		| through to pick up last byte

L6:	| 1 byte excess
	clrl	d2
	movb	a0@(-1,d1:l),d2
	lsll	#8,d2		| odd byte goes in the high half of
	addl	d2,d0		| the 16 bit word (big-endian sum)
	jra	L1		| rejoin main loop with rounded-down count

L7:	| 2 bytes excess
	clrl	d2
	movw	a0@(-2,d1:l),d2
	addl	d2,d0
	jra	L1		| rejoin main loop with rounded-down count