/* $OpenBSD: in_cksum_arm.S,v 1.9 2022/12/08 01:25:44 guenther Exp $ */
/* $NetBSD: in_cksum_arm.S,v 1.3 2003/11/26 10:31:53 rearnsha Exp $ */

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
 *
 * Layout of this file:
 *   in_cksum     - walks an mbuf chain, sums each buffer via L_cksumdata,
 *                  then folds and complements the result.
 *   in4_cksum    - optionally seeds the sum from the IP src/dst addresses
 *                  plus nxt and len, skips 'off' bytes of the chain and
 *                  then joins in_cksum at .Lin_cksum_entry4.
 *   L_cksumdata  - sums one contiguous buffer into a 32-bit accumulator.
 *
 * M_LEN, M_DATA, M_NEXT, IP_SRC and IP_DST are used as byte offsets and are
 * not defined here (presumably generated into assym.h -- confirm).
 *
 * NOTE(review): two places below use a computed branch of the form
 * "add pc, pc, rN".  They rely on pc reading as the address of the current
 * instruction + 8 (ARM/A32 state) and on the exact number of instructions
 * that follow.  Do not insert, remove or reorder instructions in those
 * regions without recomputing the offsets.
 */

#include "assym.h"

#include <machine/asm.h>

.syntax unified

/*
 * int in_cksum(struct mbuf *m, int len)
 *
 * Entry:
 *	r0	m
 *	r1	len
 *
 * Returns:
 *	r0	16-bit result: the 32-bit sum folded to 16 bits and then
 *		inverted (eor with 0xffff)
 *
 * NOTE: Assumes 'm' is *never* NULL.
 *
 * Register usage in the loop (all survive the call to L_cksumdata, which
 * is documented as clobbering only r0-r7):
 *	r8	running 32-bit sum with end-around carry
 *	r9	bytes of 'len' still to be summed
 *	r10	number of bytes summed so far
 *	r11	(bytes summed so far) ^ (buffer address); only bit 0 is used
 *	ip	next mbuf in the chain (NULL terminates the loop)
 *
 * The loop only terminates when the chain ends; once r9 reaches zero the
 * remaining mbufs contribute a zero-length (and therefore zero) sum.
 */
/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00		/* sum = 0 */
	mov	r9, r1			/* r9 = len remaining */
	mov	r10, #0x00		/* no bytes summed yet */
	mov	ip, r0			/* ip = current mbuf */

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf */
	ldr	r0, [ip, #(M_DATA)]	/* r0 = data pointer of this mbuf */
	ldr	ip, [ip, #(M_NEXT)]	/* ip = following mbuf (may be NULL) */
	/*
	 * in4_cksum joins here with r0/r1 describing the first buffer to
	 * sum, ip = next mbuf, r8 = seed sum, r9 = len and r10 = 0.
	 */
.Lin_cksum_entry4:
	cmp	r9, r1
	movlt	r1, r9			/* r1 = min(mbuf length, len remaining) */
	sub	r9, r9, r1
	eor	r11, r10, r0		/* bit 0: stream offset parity != address parity */
	add	r10, r10, r1
	adds	r2, r1, #0x00		/* r2 = r1; Z set when nothing to sum */
	blne	L_cksumdata		/* r2 = partial sum (skipped, r2 = 0, if empty) */
	tst	r11, #0x01
	movne	r2, r2, ror #8		/* parity mismatch: swap byte lanes of partial sum */
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* fold the carry back in */
	cmp	ip, #0x00
	bne	.Lin_cksum_loop

	/* Fold the 32-bit sum to 16 bits and return its complement */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16	/* low half + high half */
	add	r0, r0, r0, lsr #16	/* add back any carry out of bit 15 */
	and	r0, r0, r1
	eor	r0, r0, r1		/* invert the low 16 bits */
	ldmfd	sp!, {r4-r11,pc}

/*
 * int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
 *
 * Entry:
 *	r0	m
 *	r1	nxt
 *	r2	off
 *	r3	len
 *
 * If nxt is non-zero the sum is seeded from nxt + len and the two 32-bit
 * words at IP_SRC and IP_DST in the first mbuf's data.  The routine then
 * skips 'off' bytes of the chain and branches into in_cksum at
 * .Lin_cksum_entry4, so it returns through in_cksum's epilogue (both
 * routines push the same register set).  Running out of mbufs while
 * skipping calls panic().
 */
/* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
ENTRY(in4_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00		/* Accumulate sum in r8 */

	/*
	 * First, deal with a pseudo header, if present
	 */
	ldr	r6, [r0, #(M_DATA)]	/* r6 = data pointer of first mbuf */
	cmp	r1, #0x00
	beq	.Lin4_cksum_skip_entry	/* nxt == 0: no pseudo header; r1 (0) doubles
					 * as the "bytes already skipped" count */

	add	r4, r6, #(IP_SRC)
	ands	r4, r4, #0x03		/* r4 = alignment of the source address field */
	add	r8, r1, r3		/* sum = nxt + len */
	/*
	 * Computed branch: for r4 = 1..3 jump to (this insn + 8) + r4 * 32.
	 * Case 0x00 below is reached by falling through and occupies nine
	 * instructions (36 bytes); cases 0x01 and 0x02 occupy eight (32 bytes)
	 * each.  The nops are padding that keeps every case at the expected
	 * offset.
	 */
	addne	pc, pc, r4, lsl #5	/* Handle alignment of pseudo header */

	/* 0x00: Data 32-bit aligned */
	ldr	r5, [r6, #(IP_SRC)]
	ldr	r4, [r6, #(IP_DST)]
	b	.Lin4_cksum_add_ips
	nop
	nop
	nop
	nop
	nop
	nop

	/* 0x01: Data 8-bit aligned */
	ldr	r4, [r6, #(IP_SRC - 1)]		/* BE:r4 = x012  LE:r4 = 210x */
	ldr	r5, [r6, #(IP_SRC + 3)]		/* BE:r5 = 3456  LE:r5 = 6543 */
	ldrb	r7, [r6, #(IP_SRC + 7)]		/* r7 = ...7 */
	mov	r4, r4, lsr #8			/* r4 = .210 */
	orr	r4, r4, r5, lsl #24		/* r4 = 3210 */
	mov	r5, r5, lsr #8			/* r5 = .654 */
	orr	r5, r5, r7, lsl #24		/* r5 = 7654 */
	b	.Lin4_cksum_add_ips

	/* 0x02: Data 16-bit aligned */
	ldr	r4, [r6, #(IP_SRC - 2)]		/* r4 = 10xx */
	ldrh	r7, [r6, #(IP_DST + 2)]		/* r7 = ..76 */
	ldr	r5, [r6, #(IP_SRC + 2)]		/* r5 = 5432 */
	mov	r4, r4, lsr #16			/* r4 = ..10 */
	orr	r4, r4, r7, lsl #16		/* r4 = 7610 */
	b	.Lin4_cksum_add_ips
	nop
	nop

	/* 0x03: Data 8-bit aligned */
	ldrb	r4, [r6, #(IP_SRC)]		/* r4 = ...0 */
	ldr	r5, [r6, #(IP_SRC + 1)]		/* BE:r5 = 1234  LE:r5 = 4321 */
	ldr	r7, [r6, #(IP_SRC + 5)]		/* BE:r7 = 567x  LE:r7 = x765 */
	orr	r4, r4, r5, lsl #8		/* r4 = 3210 */
	mov	r5, r5, lsr #24			/* r5 = ...4 */
	orr	r5, r5, r7, lsl #8		/* r5 = 7654 */
	/* FALLTHROUGH */

.Lin4_cksum_add_ips:
	/*
	 * r4/r5 hold the eight address bytes as two 32-bit words.
	 * sum = r4 + r5 + ((nxt + len) << 8), with end-around carry.
	 * NOTE(review): the << 8 presumably moves nxt/len into the byte
	 * lanes that match data loaded little-endian -- confirm.
	 */
	adds	r5, r5, r4
	adcs	r8, r5, r8, lsl #8
	adc	r8, r8, #0x00
	mov	r1, #0x00		/* nothing skipped yet */
	b	.Lin4_cksum_skip_entry

	/*
	 * Skip 'off' bytes.  On each pass r1 is the length of the mbuf just
	 * examined (0 on the first pass), r6 its data pointer, r0 the next
	 * mbuf to examine and r2 the offset still to skip.
	 */
.Lin4_cksum_skip_loop:
	ldr	r1, [r0, #(M_LEN)]
	ldr	r6, [r0, #(M_DATA)]
	ldr	r0, [r0, #(M_NEXT)]
.Lin4_cksum_skip_entry:
	subs	r2, r2, r1
	blt	.Lin4_cksum_skip_done	/* offset falls inside the mbuf just examined */
	cmp	r0, #0x00
	bne	.Lin4_cksum_skip_loop
	b	.Lin4_cksum_whoops	/* chain ended before 'off' was consumed */

.Lin4_cksum_skip_done:
	/* Here r2 = (offset into this mbuf) - (its length), i.e. negative */
	mov	ip, r0			/* ip = next mbuf for in_cksum's loop */
	add	r0, r2, r6
	add	r0, r0, r1		/* r0 = data + offset into this mbuf */
	rsb	r1, r2, #0x00		/* r1 = bytes from there to end of mbuf */
	mov	r9, r3			/* r9 = len */
	mov	r10, #0x00		/* no bytes summed yet */
	b	.Lin_cksum_entry4

.Lin4_cksum_whoops:
	adr	r0, .Lin4_cksum_whoops_str
	bl	panic			/* the string follows directly, so this
					 * relies on panic() not returning */
.Lin4_cksum_whoops_str:
	.asciz	"in4_cksum: out of mbufs\n"
	.align	5

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 *
 * Structure: consume 1-3 leading bytes to word-align r0, sum 64-byte
 * blocks, then at most one 32-byte block, then up to three 8-byte groups,
 * then at most one word, and finally 1-3 trailing bytes.  Within each run
 * the carry flag is chained from one adcs to the next and folded back into
 * r2 by a trailing "adc r2, r2, #0".
 *
 * in_cksum only calls this with r1 != 0.  NOTE(review): a zero length is
 * not rejected here and can reach the byte load in the endgame -- confirm
 * before adding any new caller.
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
	mov	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach alignment (1..3) */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	/*
	 * The flags from this compare select everything down to the
	 * "subs" below: lt => 1 byte, eq => 2 bytes, gt => 3 bytes.
	 */
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/*
	 * Combine the three bytes depending on endianness and alignment.
	 * eq: the buffer started on an even address, so the first byte goes
	 *     in the low lane.
	 * ne: the buffer started on an odd address, so the first byte goes
	 *     in bits 8-15.
	 */
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
	subs	r1, r1, r7		/* Update length */
	moveq	pc, lr			/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Sum 64 bytes per iteration; r1 is kept biased by -0x40 */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3		/* start of the carry chain */
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00		/* fold the final carry */
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:

	adds	r1, r1, #0x40		/* undo the bias: r1 = bytes left (0..63) */
	moveq	pc, lr
	cmp	r1, #0x20

	blt	.Lcksumdata_less_than_32
	/* One 32-byte block */
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	moveq	pc, lr

.Lcksumdata_less_than_32:
	/* There are less than 32 bytes left */
	and	r3, r1, #0x18		/* r3 = bytes in whole 8-byte groups (0/8/16/24) */
	rsb	r4, r3, #0x18		/* r4 = 8 * (number of groups to skip) */
	sub	r1, r1, r3		/* r1 = bytes left after the groups (0..7) */
	/* r4 *= 1.5: 12 bytes of code (three instructions) per skipped group */
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	/*
	 * Computed branch into the ldmia/adcs/adcs groups below.  The nop in
	 * the first group occupies the slot that pc (this insn + 8) points
	 * at, so adding 12 per skipped group lands on the next ldmia, or on
	 * the final adc when there is no group to sum.
	 */
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00		/* fold the carry from the groups above */
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	/* One more whole word */
	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04		/* keep r1 biased by -4 for the adds below */
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04		/* undo the bias: r1 = bytes left (0..3) */
	moveq	pc, lr

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/*
	 * Combine the three bytes depending on endianness and alignment.
	 * An even r0 puts the first byte in the low lane; an odd r0 puts
	 * it in bits 8-15.
	 */
	tst	r0, #0x01
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
	adds	r2, r2, r3
	adc	r2, r2, #0x00
	mov	pc, lr