xref: /freebsd/sys/arm/arm/in_cksum_arm.S (revision f05cddf9)
1/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 */
38
39/*
40 * Hand-optimised in_cksum() and do_cksum() checksum routines for ARM/armv5e
41 */
42
43#include "opt_inet.h"
44
45#include <machine/asm.h>
46#include "assym.s"
47__FBSDID("$FreeBSD$");
48
/*
 * int in_cksum(struct mbuf *m, int len)
 *
 * Sums up to 'len' bytes of the mbuf chain starting at 'm', folds the
 * 32-bit end-around-carry sum to 16 bits and returns its one's complement.
 *
 * Entry:
 *	r0	m
 *	r1	len
 *
 * Returns:
 *	r0	Folded, complemented 16-bit checksum (upper 16 bits zero)
 *
 * Register usage:
 *	r8	Running 32-bit sum
 *	r9	Bytes still to be summed
 *	r10	Bytes consumed so far (offset into the data stream)
 *	r11	Stream offset XOR data address of the current mbuf
 *	ip	Next mbuf in the chain (loaded before the current one is summed)
 *
 * NOTE: Assumes 'm' is *never* NULL.
 * NOTE: If the chain holds fewer than 'len' bytes, the bytes that are
 *	 present are summed and no error is reported.
 */
/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00		/* sum = 0 */
	mov	r9, r1			/* remaining = len */
	mov	r10, #0x00		/* offset = 0 */
	mov	ip, r0			/* ip = m */

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
	cmp	r9, r1
	movlt	r1, r9			/* r1 = min(remaining, m_len) */
	sub	r9, r9, r1
	eor	r11, r10, r0		/* Bit 0: offset/address parity differ */
	add	r10, r10, r1
	/*
	 * r2 = r1.  L_cksumdata must not be entered with a zero length, so
	 * the call is skipped in that case and r2 (zero) is the partial sum.
	 */
	adds	r2, r1, #0x00
	blne	_ASM_LABEL(L_cksumdata)
	/*
	 * L_cksumdata positions bytes by memory address.  If that parity
	 * disagrees with the position in the data stream, rotate the partial
	 * sum by 8 bits; after folding this equals swapping the two bytes of
	 * the 16-bit sum.
	 */
	tst	r11, #0x01
	movne	r2, r2, ror #8
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* End-around carry */
	/*
	 * Stop as soon as the requested length has been consumed instead of
	 * walking the rest of the chain; otherwise continue while there is
	 * another mbuf.
	 */
	cmp	r9, #0x00
	cmpne	ip, #0x00
	bne	.Lin_cksum_loop

	/* Fold the 32-bit sum to 16 bits and complement it */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16	/* Fold in the carry of the first fold */
	and	r0, r0, r1
	eor	r0, r0, r1
	ldmfd	sp!, {r4-r11,pc}
END(in_cksum)
94
/*
 * do_cksum: C-callable entry point to L_cksumdata.
 * (Presumably declared by its C caller as int do_cksum(const void *, int);
 * the prototype is not visible in this file -- confirm.)
 *
 * Entry:
 *	r0	Pointer to buffer
 *	r1	Buffer length in bytes
 *
 * Returns:
 *	r0	The unfolded 32-bit sum left in r2 by L_cksumdata, or zero
 *		when the length is zero
 *
 * r4-r7 are saved here because L_cksumdata clobbers r0-r7.
 *
 * L_cksumdata's short-buffer path loads and sums the first byte of the
 * buffer unconditionally, so it must not be entered with a zero length.
 * The adds/blne pair below (the same idiom in_cksum uses) skips the call
 * in that case and returns the zero already held in r2.
 */
ENTRY(do_cksum)
	stmfd	sp!, {r4-r7, lr}
	adds	r2, r1, #0x00		/* r2 = len; Z set when len == 0 */
	blne	_ASM_LABEL(L_cksumdata)
	mov	r0, r2
	ldmfd	sp!, {r4-r7, pc}
END(do_cksum)
101
102/*
103 * The main in*_cksum() workhorse...
104 *
 * Adds the buffer into a 32-bit sum with end-around carry.  An unaligned
 * head and a 1-3 byte tail are merged byte-wise; everything in between is
 * summed a word at a time by the unrolled 64-, 32- and 8-byte stages.
 * Head and tail bytes are positioned by the parity of their memory
 * address, not by their offset within the buffer.
 *
105 * Entry parameters:
106 *	r0	Pointer to buffer
107 *	r1	Buffer length
108 *	lr	Return address
109 *
 * The length must not be zero: the alignment and short-buffer paths branch
 * to .Lcksumdata_endgame, which loads and sums the first byte
 * unconditionally.
 *
110 * Returns:
111 *	r2	Accumulated 32-bit sum
112 *
113 * Clobbers:
114 *	r0-r7
 *	Condition flags
 *
 * r8-r12 are left untouched and the stack is not used.
115 */
116/* LINTSTUB: Ignore */
117ASENTRY_NP(L_cksumdata)
118#ifdef _ARM_ARCH_5E
119	pld	[r0]			/* Pre-fetch the start of the buffer */
120#endif
121	mov	r2, #0
122
123	/* We first have to word-align the buffer.  */
124	ands	r7, r0, #0x03
125	beq	.Lcksumdata_wordaligned
	/* r7 = number of bytes up to the next word boundary (1-3) */
126	rsb	r7, r7, #0x04
127	cmp	r1, r7			/* Enough bytes left to make it? */
128	blt	.Lcksumdata_endgame
	/*
	 * The flags set by the next compare stay live through the loads and
	 * the orr sequence (none of those instructions sets flags):
	 *   ge / gt  load the 2nd / 3rd byte, lt / le zero r5 / r6 instead;
	 *   eq       r7 == 2, i.e. the buffer starts on an even address;
	 *   ne       r7 is 1 or 3, i.e. the buffer starts on an odd address.
	 */
129	cmp	r7, #0x02
130	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
131	ldrgeb	r5, [r0], #0x01		/* Fetch 2nd byte */
132	movlt	r5, #0x00
133	ldrgtb	r6, [r0], #0x01		/* Fetch 3rd byte */
134	movle	r6, #0x00
135	/* Combine the three bytes depending on endianness and alignment */
136#ifdef __ARMEB__
137	orreq	r2, r5, r4, lsl #8
138	orreq	r2, r2, r6, lsl #24
139	orrne	r2, r4, r5, lsl #8
140	orrne	r2, r2, r6, lsl #16
141#else
142	orreq	r2, r4, r5, lsl #8
143	orreq	r2, r2, r6, lsl #16
144	orrne	r2, r5, r4, lsl #8
145	orrne	r2, r2, r6, lsl #24
146#endif
147	subs	r1, r1, r7		/* Update length */
148	RETeq			/* All done? */
149
150	/* Buffer is now word aligned */
151.Lcksumdata_wordaligned:
152#ifdef _ARM_ARCH_5E
153	cmp	r1, #0x04		/* Less than 4 bytes left? */
154	blt	.Lcksumdata_endgame	/* Yup */
155
156	/* Now quad-align, if necessary */
	/*
	 * r7 is zero if r0 is already quad aligned; otherwise it receives
	 * the leading word.  Either way r7 is added into r2 later, by the
	 * first pass of the big loop or at .Lcksumdata_bigloop_end.
	 */
157	ands	r7, r0, #0x04
158	ldrne	r7, [r0], #0x04
159	subne	r1, r1, #0x04
	/* From here r1 carries a bias of -64 until it is re-added below */
160	subs	r1, r1, #0x40
161	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */
162
163	/*
164	 * Buffer is now quad aligned. Sum 64 bytes at a time.
165	 * Note: First ldrd is hoisted above the loop, together with
166	 * setting r6 to zero to avoid stalling for results in the
167	 * loop. (r7 is live, from above).
168	 */
169	ldrd	r4, [r0], #0x08
170	mov	r6, #0x00
171.Lcksumdata_bigloop:
	/*
	 * On entry r4/r5 hold the next 8 bytes and r6/r7 hold values not
	 * yet added.  The seven ldrd below plus the hoisted/conditional one
	 * consume 64 bytes per pass; the last r6/r7 pair is left pending.
	 */
172	pld	[r0, #0x18]
173	adds	r2, r2, r6
174	adcs	r2, r2, r7
175	ldrd	r6, [r0], #0x08
176	adcs	r2, r2, r4
177	adcs	r2, r2, r5
178	ldrd	r4, [r0], #0x08
179	adcs	r2, r2, r6
180	adcs	r2, r2, r7
181	ldrd	r6, [r0], #0x08
182	adcs	r2, r2, r4
183	adcs	r2, r2, r5
184	ldrd	r4, [r0], #0x08
185	adcs	r2, r2, r6
186	adcs	r2, r2, r7
187	pld	[r0, #0x18]
188	ldrd	r6, [r0], #0x08
189	adcs	r2, r2, r4
190	adcs	r2, r2, r5
191	ldrd	r4, [r0], #0x08
192	adcs	r2, r2, r6
193	adcs	r2, r2, r7
194	ldrd	r6, [r0], #0x08
195	adcs	r2, r2, r4
196	adcs	r2, r2, r5
	/* Fold the last carry before subs overwrites the C flag */
197	adc	r2, r2, #0x00
198	subs	r1, r1, #0x40
	/* Only pre-load the next pair if another full pass follows */
199	ldrged	r4, [r0], #0x08
200	bge	.Lcksumdata_bigloop
201
202	adds	r2, r2, r6		/* r6/r7 still need summing */
203.Lcksumdata_bigloop_end:
	/*
	 * Also reached directly from the quad-align code with C clear, in
	 * which case only the quad-align word (or zero) in r7 is added.
	 */
204	adcs	r2, r2, r7
205	adc	r2, r2, #0x00
206
207#else	/* !_ARM_ARCH_5E */
208
	/* From here r1 carries a bias of -64 until it is re-added below */
209	subs	r1, r1, #0x40
210	blt	.Lcksumdata_bigloop_end
211
212.Lcksumdata_bigloop:
	/* 64 bytes per pass: four 16-byte ldmia bursts on one carry chain */
213	ldmia	r0!, {r3, r4, r5, r6}
214	adds	r2, r2, r3
215	adcs	r2, r2, r4
216	adcs	r2, r2, r5
217	ldmia	r0!, {r3, r4, r5, r7}
218	adcs	r2, r2, r6
219	adcs	r2, r2, r3
220	adcs	r2, r2, r4
221	adcs	r2, r2, r5
222	ldmia	r0!, {r3, r4, r5, r6}
223	adcs	r2, r2, r7
224	adcs	r2, r2, r3
225	adcs	r2, r2, r4
226	adcs	r2, r2, r5
227	ldmia	r0!, {r3, r4, r5, r7}
228	adcs	r2, r2, r6
229	adcs	r2, r2, r3
230	adcs	r2, r2, r4
231	adcs	r2, r2, r5
232	adcs	r2, r2, r7
	/* Fold the last carry before subs overwrites the C flag */
233	adc	r2, r2, #0x00
234	subs	r1, r1, #0x40
235	bge	.Lcksumdata_bigloop
236.Lcksumdata_bigloop_end:
237#endif
238
	/* Remove the -64 bias: r1 = bytes still to sum (0-63) */
239	adds	r1, r1, #0x40
240	RETeq
	/* The flags of this compare select the 32-byte stage below */
241	cmp	r1, #0x20
242
243#ifdef _ARM_ARCH_5E
244	ldrged	r4, [r0], #0x08		/* Avoid stalling pld and result */
245	blt	.Lcksumdata_less_than_32
246	pld	[r0, #0x18]
247	ldrd	r6, [r0], #0x08
248	adds	r2, r2, r4
249	adcs	r2, r2, r5
250	ldrd	r4, [r0], #0x08
251	adcs	r2, r2, r6
252	adcs	r2, r2, r7
253	ldrd	r6, [r0], #0x08
254	adcs	r2, r2, r4
255	adcs	r2, r2, r5
256	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
257	adcs	r2, r2, r7
258#else
259	blt	.Lcksumdata_less_than_32
260	ldmia	r0!, {r3, r4, r5, r6}
261	adds	r2, r2, r3
262	adcs	r2, r2, r4
263	adcs	r2, r2, r5
264	ldmia	r0!, {r3, r4, r5, r7}
265	adcs	r2, r2, r6
266	adcs	r2, r2, r3
267	adcs	r2, r2, r4
268	adcs	r2, r2, r5
269	adcs	r2, r2, r7
270#endif
271	adc	r2, r2, #0x00
272	subs	r1, r1, #0x20
273	RETeq
274
275.Lcksumdata_less_than_32:
276	/* There are less than 32 bytes left */
	/*
	 * Computed jump into the three 8-byte steps below.
	 *   r3 = bytes covered by whole 8-byte pairs (0, 8, 16 or 24)
	 *   r4 = 0x18 - r3 = bytes of data that need no step
	 * Each step is three ARM instructions (12 bytes of code) per 8 bytes
	 * of data, so the code offset is r4 * 1.5.  pc reads as the address
	 * of the addne plus 8, i.e. the first ldmia after the nop.  When r4
	 * is zero the addne is not executed and control falls through the
	 * nop into the 24-byte case.  The number of instructions between
	 * the nop and the "Less than 8 bytes" adc must therefore not change.
	 */
277	and	r3, r1, #0x18
278	rsb	r4, r3, #0x18
279	sub	r1, r1, r3
280	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
281	addne	pc, pc, r4
282	nop
283
284/*
285 * Note: We use ldm here, even on armv5e, since the combined issue/result
286 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
287 */
288	/* At least 24 bytes remaining... */
289	ldmia	r0!, {r4, r5}
290	adcs	r2, r2, r4
291	adcs	r2, r2, r5
292
293	/* At least 16 bytes remaining... */
294	ldmia	r0!, {r4, r5}
295	adcs	r2, r2, r4
296	adcs	r2, r2, r5
297
298	/* At least 8 bytes remaining... */
299	ldmia	r0!, {r4, r5}
300	adcs	r2, r2, r4
301	adcs	r2, r2, r5
302
303	/* Less than 8 bytes remaining... */
304	adc	r2, r2, #0x00
	/* r1 is biased by -4 here; .Lcksumdata_lessthan4 re-adds it */
305	subs	r1, r1, #0x04
306	blt	.Lcksumdata_lessthan4
307
308	ldr	r4, [r0], #0x04
309	sub	r1, r1, #0x04
310	adds	r2, r2, r4
311	adc	r2, r2, #0x00
312
313	/* Deal with < 4 bytes remaining */
314.Lcksumdata_lessthan4:
315	adds	r1, r1, #0x04
316	RETeq
317
318	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
319.Lcksumdata_endgame:
	/*
	 * The first byte is loaded without checking r1, which is why the
	 * routine must never be entered with a zero length.
	 */
320	ldrb	r3, [r0]		/* Fetch first byte */
321	cmp	r1, #0x02
322	ldrgeb	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
323	movlt	r4, #0x00
324	ldrgtb	r5, [r0, #0x02]
325	movle	r5, #0x00
326	/* Combine the three bytes depending on endianness and alignment */
	/* ne after this test means the first byte sits on an odd address */
327	tst	r0, #0x01
328#ifdef __ARMEB__
329	orreq	r3, r4, r3, lsl #8
330	orreq	r3, r3, r5, lsl #24
331	orrne	r3, r3, r4, lsl #8
332	orrne	r3, r3, r5, lsl #16
333#else
334	orreq	r3, r3, r4, lsl #8
335	orreq	r3, r3, r5, lsl #16
336	orrne	r3, r4, r3, lsl #8
337	orrne	r3, r3, r5, lsl #24
338#endif
339	adds	r2, r2, r3
340	adc	r2, r2, #0x00
341	RET
342END(L_cksumdata)
343
344