/*	$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $	*/
/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int cpu_in_cksum(struct mbuf *, int, int, uint32_t);
#endif

/*
 * Checksum routine for Internet Protocol family headers (portable version).
 *
 * This routine is very heavily used in the network code and should be
 * modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32-bit architectures uses a 32-bit
 * accumulator and operates on 16-bit operands.
 *
 * The default implementation for 64-bit architectures uses a 64-bit
 * accumulator and operates on 32-bit operands.
 *
 * Both versions unroll the core of the inner loop to process 32-byte /
 * 64-byte fragments per iteration.  After each iteration of the inner
 * loop, a partial reduction is done to avoid carry overflow on long
 * packets.
 */
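
/*
 * For reference, a minimal sketch of the plain RFC 1071 algorithm that
 * both versions below optimize: sum the data as 16-bit words in network
 * byte order, fold the carries back into the low word, and return the
 * one's complement.  Kept under #if 0; the function name is illustrative
 * only, and it ignores the mbuf chain, offset handling and initial seed.
 */
#if 0
static uint16_t
rfc1071_cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1)
		sum += (uint32_t)buf[0] << 8;	/* pad the odd byte with zero */
	while (sum >> 16)	/* fold carries back into the low 16 bits */
		sum = (sum >> 16) + (sum & 0xffff);
	return ~sum & 0xffff;
}
#endif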

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
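	/* Pre-fold the 32-bit seed; the result fits in 17 bits. */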
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

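	/* Skip over the initial offset, which may span multiple mbufs. */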
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

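	/* Walk the chain, summing len bytes of payload one mbuf at a time. */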
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
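		/*
		 * If this fragment starts at an odd offset into the packet,
		 * every 16-bit word is loaded byte-swapped relative to its
		 * position in the packet; remember to rotate partial before
		 * folding it into sum.
		 */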
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
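			/*
			 * The top bits are set, so partial is close to
			 * overflowing; fold it into sum before the next
			 * 32-byte round.
			 */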
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below: the remaining tests use bit
		 * masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

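		/*
		 * Fold the fragment's partial sum into sum, byte-swapped
		 * if the fragment started on an odd packet offset.
		 */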
		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
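	/* Final fold to 16 bits; two rounds catch any remaining carry. */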
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
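	/* A 64-bit accumulator absorbs the 32-bit seed without pre-folding. */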
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
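		/* Align to a 4-byte boundary so the main loop can use 32-bit loads. */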
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
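			/*
			 * Same partial reduction as in the 32-bit version,
			 * at twice the width.
			 */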
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below: the remaining tests use bit
		 * masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
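	/* Fold the four 16-bit fields of the 64-bit accumulator to 16 bits. */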
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif