/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
36 * 37 */ 38 39/* 40 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e 41 */ 42 43#include "opt_inet.h" 44 45#include <machine/asm.h> 46#include "assym.s" 47__FBSDID("$FreeBSD$"); 48 49/* 50 * int in_cksum(struct mbuf *m, int len) 51 * 52 * Entry: 53 * r0 m 54 * r1 len 55 * 56 * NOTE: Assumes 'm' is *never* NULL. 57 */ 58/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */ 59ENTRY(in_cksum) 60 stmfd sp!, {r4-r11,lr} 61 mov r8, #0x00 62 mov r9, r1 63 mov r10, #0x00 64 mov ip, r0 65 66.Lin_cksum_loop: 67 ldr r1, [ip, #(M_LEN)] 68 ldr r0, [ip, #(M_DATA)] 69 ldr ip, [ip, #(M_NEXT)] 70.Lin_cksum_entry4: 71 cmp r9, r1 72 movlt r1, r9 73 sub r9, r9, r1 74 eor r11, r10, r0 75 add r10, r10, r1 76 adds r2, r1, #0x00 77 blne _ASM_LABEL(L_cksumdata) 78 tst r11, #0x01 79 movne r2, r2, ror #8 80 adds r8, r8, r2 81 adc r8, r8, #0x00 82 cmp ip, #0x00 83 bne .Lin_cksum_loop 84 85 mov r1, #0xff 86 orr r1, r1, #0xff00 87 and r0, r8, r1 88 add r0, r0, r8, lsr #16 89 add r0, r0, r0, lsr #16 90 and r0, r0, r1 91 eor r0, r0, r1 92 ldmfd sp!, {r4-r11,pc} 93END(in_cksum) 94 95ENTRY(do_cksum) 96 stmfd sp!, {r4-r7, lr} 97 bl L_cksumdata 98 mov r0, r2 99 ldmfd sp!, {r4-r7, pc} 100END(do_cksum) 101 102/* 103 * The main in*_cksum() workhorse... 104 * 105 * Entry parameters: 106 * r0 Pointer to buffer 107 * r1 Buffer length 108 * lr Return address 109 * 110 * Returns: 111 * r2 Accumulated 32-bit sum 112 * 113 * Clobbers: 114 * r0-r7 115 */ 116/* LINTSTUB: Ignore */ 117ASENTRY_NP(L_cksumdata) 118#ifdef _ARM_ARCH_5E 119 pld [r0] /* Pre-fetch the start of the buffer */ 120#endif 121 mov r2, #0 122 123 /* We first have to word-align the buffer. */ 124 ands r7, r0, #0x03 125 beq .Lcksumdata_wordaligned 126 rsb r7, r7, #0x04 127 cmp r1, r7 /* Enough bytes left to make it? 
*/ 128 blt .Lcksumdata_endgame 129 cmp r7, #0x02 130 ldrb r4, [r0], #0x01 /* Fetch 1st byte */ 131 ldrgeb r5, [r0], #0x01 /* Fetch 2nd byte */ 132 movlt r5, #0x00 133 ldrgtb r6, [r0], #0x01 /* Fetch 3rd byte */ 134 movle r6, #0x00 135 /* Combine the three bytes depending on endianness and alignment */ 136#ifdef __ARMEB__ 137 orreq r2, r5, r4, lsl #8 138 orreq r2, r2, r6, lsl #24 139 orrne r2, r4, r5, lsl #8 140 orrne r2, r2, r6, lsl #16 141#else 142 orreq r2, r4, r5, lsl #8 143 orreq r2, r2, r6, lsl #16 144 orrne r2, r5, r4, lsl #8 145 orrne r2, r2, r6, lsl #24 146#endif 147 subs r1, r1, r7 /* Update length */ 148 RETeq /* All done? */ 149 150 /* Buffer is now word aligned */ 151.Lcksumdata_wordaligned: 152#ifdef _ARM_ARCH_5E 153 cmp r1, #0x04 /* Less than 4 bytes left? */ 154 blt .Lcksumdata_endgame /* Yup */ 155 156 /* Now quad-align, if necessary */ 157 ands r7, r0, #0x04 158 ldrne r7, [r0], #0x04 159 subne r1, r1, #0x04 160 subs r1, r1, #0x40 161 blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */ 162 163 /* 164 * Buffer is now quad aligned. Sum 64 bytes at a time. 165 * Note: First ldrd is hoisted above the loop, together with 166 * setting r6 to zero to avoid stalling for results in the 167 * loop. (r7 is live, from above). 
168 */ 169 ldrd r4, [r0], #0x08 170 mov r6, #0x00 171.Lcksumdata_bigloop: 172 pld [r0, #0x18] 173 adds r2, r2, r6 174 adcs r2, r2, r7 175 ldrd r6, [r0], #0x08 176 adcs r2, r2, r4 177 adcs r2, r2, r5 178 ldrd r4, [r0], #0x08 179 adcs r2, r2, r6 180 adcs r2, r2, r7 181 ldrd r6, [r0], #0x08 182 adcs r2, r2, r4 183 adcs r2, r2, r5 184 ldrd r4, [r0], #0x08 185 adcs r2, r2, r6 186 adcs r2, r2, r7 187 pld [r0, #0x18] 188 ldrd r6, [r0], #0x08 189 adcs r2, r2, r4 190 adcs r2, r2, r5 191 ldrd r4, [r0], #0x08 192 adcs r2, r2, r6 193 adcs r2, r2, r7 194 ldrd r6, [r0], #0x08 195 adcs r2, r2, r4 196 adcs r2, r2, r5 197 adc r2, r2, #0x00 198 subs r1, r1, #0x40 199 ldrged r4, [r0], #0x08 200 bge .Lcksumdata_bigloop 201 202 adds r2, r2, r6 /* r6/r7 still need summing */ 203.Lcksumdata_bigloop_end: 204 adcs r2, r2, r7 205 adc r2, r2, #0x00 206 207#else /* !_ARM_ARCH_5E */ 208 209 subs r1, r1, #0x40 210 blt .Lcksumdata_bigloop_end 211 212.Lcksumdata_bigloop: 213 ldmia r0!, {r3, r4, r5, r6} 214 adds r2, r2, r3 215 adcs r2, r2, r4 216 adcs r2, r2, r5 217 ldmia r0!, {r3, r4, r5, r7} 218 adcs r2, r2, r6 219 adcs r2, r2, r3 220 adcs r2, r2, r4 221 adcs r2, r2, r5 222 ldmia r0!, {r3, r4, r5, r6} 223 adcs r2, r2, r7 224 adcs r2, r2, r3 225 adcs r2, r2, r4 226 adcs r2, r2, r5 227 ldmia r0!, {r3, r4, r5, r7} 228 adcs r2, r2, r6 229 adcs r2, r2, r3 230 adcs r2, r2, r4 231 adcs r2, r2, r5 232 adcs r2, r2, r7 233 adc r2, r2, #0x00 234 subs r1, r1, #0x40 235 bge .Lcksumdata_bigloop 236.Lcksumdata_bigloop_end: 237#endif 238 239 adds r1, r1, #0x40 240 RETeq 241 cmp r1, #0x20 242 243#ifdef _ARM_ARCH_5E 244 ldrged r4, [r0], #0x08 /* Avoid stalling pld and result */ 245 blt .Lcksumdata_less_than_32 246 pld [r0, #0x18] 247 ldrd r6, [r0], #0x08 248 adds r2, r2, r4 249 adcs r2, r2, r5 250 ldrd r4, [r0], #0x08 251 adcs r2, r2, r6 252 adcs r2, r2, r7 253 ldrd r6, [r0], #0x08 254 adcs r2, r2, r4 255 adcs r2, r2, r5 256 adcs r2, r2, r6 /* XXX: Unavoidable result stall */ 257 adcs r2, r2, r7 258#else 259 blt 
.Lcksumdata_less_than_32 260 ldmia r0!, {r3, r4, r5, r6} 261 adds r2, r2, r3 262 adcs r2, r2, r4 263 adcs r2, r2, r5 264 ldmia r0!, {r3, r4, r5, r7} 265 adcs r2, r2, r6 266 adcs r2, r2, r3 267 adcs r2, r2, r4 268 adcs r2, r2, r5 269 adcs r2, r2, r7 270#endif 271 adc r2, r2, #0x00 272 subs r1, r1, #0x20 273 RETeq 274 275.Lcksumdata_less_than_32: 276 /* There are less than 32 bytes left */ 277 and r3, r1, #0x18 278 rsb r4, r3, #0x18 279 sub r1, r1, r3 280 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */ 281 addne pc, pc, r4 282 nop 283 284/* 285 * Note: We use ldm here, even on armv5e, since the combined issue/result 286 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs. 287 */ 288 /* At least 24 bytes remaining... */ 289 ldmia r0!, {r4, r5} 290 adcs r2, r2, r4 291 adcs r2, r2, r5 292 293 /* At least 16 bytes remaining... */ 294 ldmia r0!, {r4, r5} 295 adcs r2, r2, r4 296 adcs r2, r2, r5 297 298 /* At least 8 bytes remaining... */ 299 ldmia r0!, {r4, r5} 300 adcs r2, r2, r4 301 adcs r2, r2, r5 302 303 /* Less than 8 bytes remaining... 
*/ 304 adc r2, r2, #0x00 305 subs r1, r1, #0x04 306 blt .Lcksumdata_lessthan4 307 308 ldr r4, [r0], #0x04 309 sub r1, r1, #0x04 310 adds r2, r2, r4 311 adc r2, r2, #0x00 312 313 /* Deal with < 4 bytes remaining */ 314.Lcksumdata_lessthan4: 315 adds r1, r1, #0x04 316 RETeq 317 318 /* Deal with 1 to 3 remaining bytes, possibly misaligned */ 319.Lcksumdata_endgame: 320 ldrb r3, [r0] /* Fetch first byte */ 321 cmp r1, #0x02 322 ldrgeb r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */ 323 movlt r4, #0x00 324 ldrgtb r5, [r0, #0x02] 325 movle r5, #0x00 326 /* Combine the three bytes depending on endianness and alignment */ 327 tst r0, #0x01 328#ifdef __ARMEB__ 329 orreq r3, r4, r3, lsl #8 330 orreq r3, r3, r5, lsl #24 331 orrne r3, r3, r4, lsl #8 332 orrne r3, r3, r5, lsl #16 333#else 334 orreq r3, r3, r4, lsl #8 335 orreq r3, r3, r5, lsl #16 336 orrne r3, r4, r3, lsl #8 337 orrne r3, r3, r5, lsl #24 338#endif 339 adds r2, r2, r3 340 adc r2, r2, #0x00 341 RET 342END(L_cksumdata) 343 344