/* xref: /openbsd/sys/arch/arm/arm/in_cksum_arm.S (revision 81621933) */
1/*	$OpenBSD: in_cksum_arm.S,v 1.9 2022/12/08 01:25:44 guenther Exp $	*/
2/*	$NetBSD: in_cksum_arm.S,v 1.3 2003/11/26 10:31:53 rearnsha Exp $ */
3
4/*
5 * Copyright 2003 Wasabi Systems, Inc.
6 * All rights reserved.
7 *
8 * Written by Steve C. Woodford for Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *      This product includes software developed for the NetBSD Project by
21 *      Wasabi Systems, Inc.
22 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
23 *    or promote products derived from this software without specific prior
24 *    written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39/*
40 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
41 */
42
43#include "assym.h"
44
45#include <machine/asm.h>
46
47.syntax unified
48
49/*
50 * int in_cksum(struct mbuf *m, int len)
51 *
52 * Entry:
53 *	r0	m
54 *	r1	len
55 *
56 * NOTE: Assumes 'm' is *never* NULL.
57 */
58/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
59ENTRY(in_cksum)
60	stmfd	sp!, {r4-r11,lr}	/* preserve callee-saved regs + lr */
61	mov	r8, #0x00		/* r8 = running 32-bit sum */
62	mov	r9, r1			/* r9 = bytes still to checksum */
63	mov	r10, #0x00		/* r10 = bytes consumed so far (for parity) */
64	mov	ip, r0			/* ip = current mbuf */
65
66.Lin_cksum_loop:
	/* Fetch this mbuf's length, data pointer and successor */
67	ldr	r1, [ip, #(M_LEN)]
68	ldr	r0, [ip, #(M_DATA)]
69	ldr	ip, [ip, #(M_NEXT)]
70.Lin_cksum_entry4:
	/* in4_cksum joins here with r0/r1/ip/r9/r10 pre-loaded */
71	cmp	r9, r1			/* clamp chunk to bytes remaining */
72	movlt	r1, r9
73	sub	r9, r9, r1		/* remaining -= chunk length */
	/*
	 * r11 bit 0 = parity of (bytes consumed so far) ^ (chunk address).
	 * L_cksumdata sums relative to the buffer's own address alignment,
	 * so when the chunk's logical stream offset and its address differ
	 * in parity, the partial sum's byte lanes are swapped below.
	 */
74	eor	r11, r10, r0
75	add	r10, r10, r1
76	adds	r2, r1, #0x00		/* r2 = chunk length; Z set if empty */
77	blne	L_cksumdata		/* non-empty: r2 = 32-bit partial sum */
78	tst	r11, #0x01
79	movne	r2, r2, ror #8		/* odd parity: swap byte lanes of sum */
80	adds	r8, r8, r2		/* fold partial sum into accumulator */
81	adc	r8, r8, #0x00		/* ... with end-around carry */
82	cmp	ip, #0x00		/* more mbufs in the chain? */
83	bne	.Lin_cksum_loop
84
	/* Fold the 32-bit sum to 16 bits and return its one's complement */
85	mov	r1, #0xff
86	orr	r1, r1, #0xff00		/* r1 = 0x0000ffff */
87	and	r0, r8, r1		/* low halfword ... */
88	add	r0, r0, r8, lsr #16	/* ... plus high halfword */
89	add	r0, r0, r0, lsr #16	/* fold any carry back in */
90	and	r0, r0, r1
91	eor	r0, r0, r1		/* complement the low 16 bits */
92	ldmfd	sp!, {r4-r11,pc}	/* restore and return */
93
94/*
95 * int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
96 *
97 * Entry:
98 *	r0	m
99 *	r1	nxt
100 *	r2	off
101 *	r3	len
102 */
103/* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
104ENTRY(in4_cksum)
105	stmfd	sp!, {r4-r11,lr}
106	mov	r8, #0x00		/* Accumulate sum in r8 */
107
108	/*
109	 * First, deal with a pseudo header, if present
110	 */
111	ldr	r6, [r0, #(M_DATA)]
112	cmp	r1, #0x00		/* nxt == 0 => no pseudo header */
113	beq	.Lin4_cksum_skip_entry
114
115	add	r4, r6, #(IP_SRC)
116	ands	r4, r4, #0x03		/* r4 = alignment of ip_src (0..3) */
117	add	r8, r1, r3		/* sum = nxt + len */
	/*
	 * Computed branch: pc reads as (this insn + 8), so the target is
	 * 32 * r4 bytes past the second instruction below.  Each alignment
	 * case therefore occupies exactly 32 bytes; the nops are padding
	 * that keep the slots in step.  Do NOT add or remove instructions
	 * inside the case slots without re-padding them.
	 */
118	addne	pc, pc, r4, lsl #5	/* Handle alignment of pseudo header */
119
120	/* 0x00: Data 32-bit aligned */
121	ldr	r5, [r6, #(IP_SRC)]
122	ldr	r4, [r6, #(IP_DST)]
123	b	.Lin4_cksum_add_ips
124	nop
125	nop
126	nop
127	nop
128	nop
129	nop
130
131	/* 0x01: Data 8-bit aligned */
132	ldr	r4, [r6, #(IP_SRC - 1)]	/* BE:r4 = x012  LE:r4 = 210x */
133	ldr	r5, [r6, #(IP_SRC + 3)]	/* BE:r5 = 3456  LE:r5 = 6543 */
134	ldrb	r7, [r6, #(IP_SRC + 7)]	/* r7 = ...7 */
135	mov	r4, r4, lsr #8		/* r4 = .210 */
136	orr	r4, r4, r5, lsl #24	/* r4 = 3210 */
137	mov	r5, r5, lsr #8		/* r5 = .654 */
138	orr	r5, r5, r7, lsl #24	/* r5 = 7654 */
139	b	.Lin4_cksum_add_ips
140
141	/* 0x02: Data 16-bit aligned */
142	ldr	r4, [r6, #(IP_SRC - 2)]	/* r4 = 10xx */
143	ldrh	r7, [r6, #(IP_DST + 2)]	/* r7 = ..76 */
144	ldr	r5, [r6, #(IP_SRC + 2)]	/* r5 = 5432 */
145	mov	r4, r4, lsr #16		/* r4 = ..10 */
146	orr	r4, r4, r7, lsl #16	/* r4 = 7610 */
147	b	.Lin4_cksum_add_ips
148	nop
149	nop
150
151	/* 0x03: Data 8-bit aligned */
152	ldrb	r4, [r6, #(IP_SRC)]	/* r4 = ...0 */
153	ldr	r5, [r6, #(IP_SRC + 1)]	/* BE:r5 = 1234  LE:r5 = 4321 */
154	ldr	r7, [r6, #(IP_SRC + 5)]	/* BE:r7 = 567x  LE:r7 = x765 */
155	orr	r4, r4, r5, lsl #8	/* r4 = 3210 */
156	mov	r5, r5, lsr #24		/* r4 = ...4 */
157	orr	r5, r5, r7, lsl #8	/* r5 = 7654 */
158	/* FALLTHROUGH */
159
160.Lin4_cksum_add_ips:
	/* Fold src/dst addresses and the (nxt + len) word into r8 */
161	adds	r5, r5, r4
162	adcs	r8, r5, r8, lsl #8	/* lsl #8 positions nxt+len in the LE byte lanes */
163	adc	r8, r8, #0x00
164	mov	r1, #0x00		/* pseudo header done; don't redo it */
165	b	.Lin4_cksum_skip_entry
166
167.Lin4_cksum_skip_loop:
	/* Walk the chain to find the mbuf containing offset 'off' (r2) */
168	ldr	r1, [r0, #(M_LEN)]
169	ldr	r6, [r0, #(M_DATA)]
170	ldr	r0, [r0, #(M_NEXT)]
171.Lin4_cksum_skip_entry:
172	subs	r2, r2, r1		/* off -= m_len */
173	blt	.Lin4_cksum_skip_done	/* offset lies inside this mbuf */
174	cmp	r0, #0x00
175	bne	.Lin4_cksum_skip_loop
176	b	.Lin4_cksum_whoops	/* chain exhausted before 'off' */
177
178.Lin4_cksum_skip_done:
	/*
	 * r2 is now -(bytes of this mbuf past the offset).  Set up the
	 * registers .Lin_cksum_entry4 expects and join in_cksum's loop.
	 */
179	mov	ip, r0			/* ip = next mbuf in chain */
180	add	r0, r2, r6		/* r0 = m_data + m_len + r2 ... */
181	add	r0, r0, r1		/* ... = first byte to checksum */
182	rsb	r1, r2, #0x00		/* r1 = bytes available in this mbuf */
183	mov	r9, r3			/* r9 = total bytes to checksum */
184	mov	r10, #0x00		/* no bytes consumed yet */
185	b	.Lin_cksum_entry4
186
187.Lin4_cksum_whoops:
188	adr	r0, .Lin4_cksum_whoops_str
189	bl	panic
190.Lin4_cksum_whoops_str:
191	.asciz	"in4_cksum: out of mbufs\n"
192	.align	5
193
194/*
195 * The main in*_cksum() workhorse...
196 *
197 * Entry parameters:
198 *	r0	Pointer to buffer
199 *	r1	Buffer length
200 *	lr	Return address
201 *
202 * Returns:
203 *	r2	Accumulated 32-bit sum
204 *
205 * Clobbers:
206 *	r0-r7
207 */
208/* LINTSTUB: Ignore */
209ASENTRY_NP(L_cksumdata)
210	mov	r2, #0			/* r2 = 32-bit partial sum */
211
212	/* We first have to word-align the buffer.  */
213	ands	r7, r0, #0x03
214	beq	.Lcksumdata_wordaligned
215	rsb	r7, r7, #0x04		/* r7 = bytes to next word boundary (1..3) */
216	cmp	r1, r7			/* Enough bytes left to make it? */
217	blt	.Lcksumdata_endgame
	/* NOTE: flags from this cmp are reused by all five cond. insns below */
218	cmp	r7, #0x02
219	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
220	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
221	movlt	r5, #0x00
222	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
223	movle	r6, #0x00
	/* eq <=> buffer started on an even address; ne <=> odd address */
224	/* Combine the three bytes depending on endianness and alignment */
225	orreq	r2, r4, r5, lsl #8
226	orreq	r2, r2, r6, lsl #16
227	orrne	r2, r5, r4, lsl #8
228	orrne	r2, r2, r6, lsl #24
229	subs	r1, r1, r7		/* Update length */
230	moveq	pc, lr			/* All done? */
231
232	/* Buffer is now word aligned */
233.Lcksumdata_wordaligned:
234	subs	r1, r1, #0x40		/* at least 64 bytes to go? */
235	blt	.Lcksumdata_bigloop_end
236
	/* Main loop: 64 bytes/iteration, carries chained through adcs */
237.Lcksumdata_bigloop:
238	ldmia	r0!, {r3, r4, r5, r6}
239	adds	r2, r2, r3
240	adcs	r2, r2, r4
241	adcs	r2, r2, r5
242	ldmia	r0!, {r3, r4, r5, r7}
243	adcs	r2, r2, r6
244	adcs	r2, r2, r3
245	adcs	r2, r2, r4
246	adcs	r2, r2, r5
247	ldmia	r0!, {r3, r4, r5, r6}
248	adcs	r2, r2, r7
249	adcs	r2, r2, r3
250	adcs	r2, r2, r4
251	adcs	r2, r2, r5
252	ldmia	r0!, {r3, r4, r5, r7}
253	adcs	r2, r2, r6
254	adcs	r2, r2, r3
255	adcs	r2, r2, r4
256	adcs	r2, r2, r5
257	adcs	r2, r2, r7
258	adc	r2, r2, #0x00		/* absorb final carry */
259	subs	r1, r1, #0x40
260	bge	.Lcksumdata_bigloop
261.Lcksumdata_bigloop_end:
262
263	adds	r1, r1, #0x40		/* restore true residual count */
264	moveq	pc, lr
265	cmp	r1, #0x20		/* a 32-byte chunk left? */
266
267	blt	.Lcksumdata_less_than_32
268	ldmia	r0!, {r3, r4, r5, r6}
269	adds	r2, r2, r3
270	adcs	r2, r2, r4
271	adcs	r2, r2, r5
272	ldmia	r0!, {r3, r4, r5, r7}
273	adcs	r2, r2, r6
274	adcs	r2, r2, r3
275	adcs	r2, r2, r4
276	adcs	r2, r2, r5
277	adcs	r2, r2, r7
278	adc	r2, r2, #0x00
279	subs	r1, r1, #0x20
280	moveq	pc, lr
281
282.Lcksumdata_less_than_32:
283	/* There are less than 32 bytes left */
284	and	r3, r1, #0x18		/* r3 = remaining rounded down to 8 (0..24) */
285	rsb	r4, r3, #0x18		/* r4 = 24 - r3 */
286	sub	r1, r1, r3
	/*
	 * Computed branch: each 8-byte group below is 3 instructions
	 * (12 code bytes) once past the padding nop, so scale r4 by 1.5
	 * to get the code offset to skip.  pc reads as addne+8 (the nop);
	 * +0/+12/+24 lands on the right ldmia.  Keep the group sizes.
	 */
287	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
288	addne	pc, pc, r4
289
290/*
291 * Note: We use ldm here, even on Xscale, since the combined issue/result
292 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
293 */
294	/* At least 24 bytes remaining... */
295	ldmia	r0!, {r4, r5}
296	nop				/* padding so the jump table stays in step */
297	adcs	r2, r2, r4
298	adcs	r2, r2, r5
299
300	/* At least 16 bytes remaining... */
301	ldmia	r0!, {r4, r5}
302	adcs	r2, r2, r4
303	adcs	r2, r2, r5
304
305	/* At least 8 bytes remaining... */
306	ldmia	r0!, {r4, r5}
307	adcs	r2, r2, r4
308	adcs	r2, r2, r5
309
310	/* Less than 8 bytes remaining... */
311	adc	r2, r2, #0x00		/* absorb carry from the groups above */
312	subs	r1, r1, #0x04
313	blt	.Lcksumdata_lessthan4
314
	/* One final whole word */
315	ldr	r4, [r0], #0x04
316	sub	r1, r1, #0x04
317	adds	r2, r2, r4
318	adc	r2, r2, #0x00
319
320	/* Deal with < 4 bytes remaining */
321.Lcksumdata_lessthan4:
322	adds	r1, r1, #0x04		/* undo the subs above; Z => nothing left */
323	moveq	pc, lr
324
325	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
326.Lcksumdata_endgame:
327	ldrb	r3, [r0]		/* Fetch first byte */
328	cmp	r1, #0x02
329	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
330	movlt	r4, #0x00
331	ldrbgt	r5, [r0, #0x02]
332	movle	r5, #0x00
333	/* Combine the three bytes depending on endianness and alignment */
334	tst	r0, #0x01		/* odd address => swap byte lanes */
335	orreq	r3, r3, r4, lsl #8
336	orreq	r3, r3, r5, lsl #16
337	orrne	r3, r4, r3, lsl #8
338	orrne	r3, r3, r5, lsl #24
339	adds	r2, r2, r3
340	adc	r2, r2, #0x00
341	mov	pc, lr
342