/*	$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $	*/
/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int	cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32-bit architectures uses
 * a 32-bit accumulator and operates on 16-bit operands.
 *
 * The default implementation for 64-bit architectures uses
 * a 64-bit accumulator and operates on 32-bit operands.
 *
 * Both versions are unrolled to handle 32-byte / 64-byte fragments as the
 * core of the inner loop. After each iteration of the inner loop, a partial
 * reduction is done to avoid carry in long packets.
 */
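
/*
 * Worked example (cf. the numerical example in RFC 1071): summing the
 * octet pairs 0x0001, 0xf203, 0xf4f5 and 0xf6f7 in a 32-bit accumulator
 * yields 0x0002ddf0; folding the carries back in (0xddf0 + 0x0002) gives
 * 0xddf2, and the one's complement 0x220d is the checksum.
 *
 * When a chunk starts at an odd offset into the packet, aligned 16-bit
 * loads place every byte in the opposite half of its word relative to
 * the packet-wide pairing. Because the one's complement sum is
 * byte-order independent, this is corrected after the fact by rotating
 * the partial sum by 8 bits, e.g. (partial << 8) + (partial >> 24) for
 * the 32-bit accumulator.
 */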

#if ULONG_MAX == 0xffffffffUL
/* 32-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* Pre-fold the initial sum so that it fits into 16 bits plus carry. */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/* Skip mbufs that lie entirely before the requested offset. */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

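	/*
	 * Main loop: checksum one mbuf at a time, entering at
	 * post_initial_offset for the first (partially consumed) mbuf.
	 */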
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
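		/*
		 * started_on_odd tracks whether this chunk's 16-bit words
		 * are offset by one byte relative to the packet-wide word
		 * pairing; if so, the partial sum is byte-rotated before
		 * it is folded into sum.
		 */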
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
			if (__predict_false(partial & 0xc0000000)) {
				/* Fold partial into sum before it can overflow. */
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below, as the remaining tests
		 * use bit masks, which are unaffected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
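	/*
	 * Fold the 32-bit sum to 16 bits; the second fold absorbs any
	 * carry produced by the first.
	 */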
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	/* Skip mbufs that lie entirely before the requested offset. */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

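	/*
	 * Main loop, as in the 32-bit version, but accumulating 32-bit
	 * operands into a 64-bit sum.
	 */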
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
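		/*
		 * Consume one 16-bit word, if present, to reach 32-bit
		 * alignment for the unrolled loop below.
		 */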
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
			if (__predict_false(partial & (3ULL << 62))) {
				/* Fold partial into sum before it can overflow. */
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below, as the remaining tests
		 * use bit masks, which are unaffected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
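	/*
	 * Fold the 64-bit sum down to 16 bits; the later folds absorb
	 * the carries the earlier ones can produce.
	 */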
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif
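
/*
 * Usage sketch, not part of the original file: thanks to the !_KERNEL
 * prototype above, this file can also be compiled in userland, where an
 * mbuf can be faked by pointing m_data at an ordinary buffer (assuming
 * <sys/mbuf.h> is usable there, and with <string.h> added for memcpy).
 * A minimal self-check: append the computed checksum to an even-length
 * buffer in host byte order and verify that summing again yields zero.
 *
 *	uint8_t buf[6] = { 0x00, 0x01, 0xf2, 0x03 };
 *	struct mbuf m;
 *	uint16_t c;
 *
 *	memset(&m, 0, sizeof(m));
 *	m.m_data = (char *)buf;		// hypothetical userland setup
 *	m.m_len = 4;
 *	c = cpu_in_cksum(&m, 4, 0, 0);	// checksum of the first 4 bytes
 *	memcpy(buf + 4, &c, 2);		// store in host order, as summed
 *	m.m_len = 6;
 *	assert(cpu_in_cksum(&m, 6, 0, 0) == 0);
 */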