########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Arm Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################

        .arch   armv8-a+crc+crypto
        .text
        .align  3
        .global crc16_t10dif_pmull
        .type   crc16_t10dif_pmull, %function

/* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */

/* arguments */
w_seed          .req    w0
x_buf           .req    x1
x_len           .req    x2
w_len           .req    w2

/* returns */
w_ret           .req    w0

/* these are used as global temporary registers */
w_tmp           .req    w5
x_tmp           .req    x5
x_tmp1          .req    x6
x_tmp2          .req    x7

d_tmp1          .req    d0
d_tmp2          .req    d1
q_tmp1          .req    q0
q_tmp2          .req    q1
v_tmp1          .req    v0
v_tmp2          .req    v1

/* local variables */
w_counter       .req    w3
w_crc           .req    w0
x_crc           .req    x0
x_counter       .req    x3
x_crc16tab      .req    x4
x_buf_saved     .req    x0

crc16_t10dif_pmull:
        cmp     x_len, 1023
        sub     sp, sp, #16
        uxth    w_seed, w_seed
        bhi     .crc_fold

        mov     x_tmp, 0
        mov     w_counter, 0

.crc_table_loop_pre:
        cmp     x_len, x_tmp
        bls     .end

        sxtw    x_counter, w_counter
        adrp    x_crc16tab, .LANCHOR0
        sub     x_buf, x_buf, x_counter
        add     x_crc16tab, x_crc16tab, :lo12:.LANCHOR0

        .align  2
.crc_table_loop:
        ldrb    w_tmp, [x_buf, x_counter]
        add     x_counter, x_counter, 1
        cmp     x_len, x_counter
        eor     w_tmp, w_tmp, w_crc, lsr 8
        ldrh    w_tmp, [x_crc16tab, w_tmp, sxtw 1]
        eor     w_crc, w_tmp, w_crc, lsl 8
        uxth    w_crc, w_crc
        bhi     .crc_table_loop

.end:
        add     sp, sp, 16
        ret

/* carry-less multiplication, part1 - before loop */
q_x0            .req    q2
q_x1            .req    q3
q_x2            .req    q4
q_x3            .req    q5

v_x0            .req    v2
v_x1            .req    v3
v_x2            .req    v4
v_x3            .req    v5

d_x0            .req    d2
d_x1            .req    d3
d_x2            .req    d4
d_x3            .req    d5

// the following registers are only used in part1
d_tmp3          .req    d16
v_tmp3          .req    v16

        .align  3
.crc_fold:
        fmov    d_tmp1, x_crc
        fmov    d_tmp2, xzr
        dup     d_tmp3, v_tmp2.d[0]
        shl     d_tmp1, d_tmp1, 48
        ins     v_tmp3.d[1], v_tmp1.d[0]
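/*
 * How the fold state is seeded (a reading of the code above; the exact
 * bit-alignment argument is an interpretation, not stated in the source):
 * the 16-bit seed is shifted to the top of a 64-bit lane and inserted
 * into the upper half of a 128-bit value, so it lines up with the first
 * two bytes of the byte-reversed first data block when the two are
 * XORed below. Buffers of 1023 bytes or less never reach this point;
 * they take the byte-at-a-time table path above, which computes the
 * classic crc = (uint16_t)((crc << 8) ^ crc16tab[(crc >> 8) ^ *buf++]).
 */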

        and     x_counter, x_len, -64
        sub     x_counter, x_counter, #64
        cmp     x_counter, 63
        add     x_buf_saved, x_buf, 64

        ldr     q_x0, [x_buf]
        ldr     q_x1, [x_buf, 16]
        ldr     q_x2, [x_buf, 32]
        ldr     q_x3, [x_buf, 48]

        adrp    x_tmp, .shuffle_mask_lanchor
        ldr     q7, [x_tmp, :lo12:.shuffle_mask_lanchor]

        tbl     v_tmp1.16b, {v_x0.16b}, v7.16b
        eor     v_x0.16b, v_tmp3.16b, v_tmp1.16b

        tbl     v_x1.16b, {v_x1.16b}, v7.16b
        tbl     v_x2.16b, {v_x2.16b}, v7.16b
        tbl     v_x3.16b, {v_x3.16b}, v7.16b
        bls     .crc_fold_loop_end

/* carry-less multiplication, part2 - loop */
q_y0            .req    q28
q_y1            .req    q29
q_y2            .req    q30
q_y3            .req    q31

v_y0            .req    v28
v_y1            .req    v29
v_y2            .req    v30
v_y3            .req    v31

d_x0_h          .req    d24
d_x0_l          .req    d2
d_x1_h          .req    d25
d_x1_l          .req    d3
d_x2_h          .req    d26
d_x2_l          .req    d4
d_x3_h          .req    d27
d_x3_l          .req    d5

v_x0_h          .req    v24
v_x0_l          .req    v2
v_x1_h          .req    v25
v_x1_l          .req    v3
v_x2_h          .req    v26
v_x2_l          .req    v4
v_x3_h          .req    v27
v_x3_l          .req    v5

v_tmp1_x0       .req    v24
v_tmp1_x1       .req    v25
v_tmp1_x2       .req    v26
v_tmp1_x3       .req    v27

d_p4_h          .req    d19
v_p4_h          .req    v19
d_p4_l          .req    d17
v_p4_l          .req    v17

        mov     x_tmp, 0x371d0000       /* p4 [1] */
        fmov    d_p4_h, x_tmp
        mov     x_tmp, 0x87e70000       /* p4 [0] */
        fmov    d_p4_l, x_tmp

        .align  2
.crc_fold_loop:
        add     x_buf_saved, x_buf_saved, 64
        sub     x_counter, x_counter, #64
        cmp     x_counter, 63

        dup     d_x0_h, v_x0.d[1]
        dup     d_x1_h, v_x1.d[1]
        dup     d_x2_h, v_x2.d[1]
        dup     d_x3_h, v_x3.d[1]

        dup     d_x0_l, v_x0.d[0]
        dup     d_x1_l, v_x1.d[0]
        dup     d_x2_l, v_x2.d[0]
        dup     d_x3_l, v_x3.d[0]

        ldr     q_y0, [x_buf_saved, -64]
        ldr     q_y1, [x_buf_saved, -48]
        ldr     q_y2, [x_buf_saved, -32]
        ldr     q_y3, [x_buf_saved, -16]

        pmull   v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
        pmull   v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
        pmull   v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
        pmull   v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
        pmull   v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
        pmull   v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
        pmull   v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
        pmull   v_x3_l.1q, v_x3_l.1d, v_p4_l.1d

        tbl     v_y0.16b, {v_y0.16b}, v7.16b
        tbl     v_y1.16b, {v_y1.16b}, v7.16b
        tbl     v_y2.16b, {v_y2.16b}, v7.16b
        tbl     v_y3.16b, {v_y3.16b}, v7.16b

        eor     v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
        eor     v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
        eor     v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
        eor     v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b

        eor     v_x0.16b, v_tmp1_x0.16b, v_y0.16b
        eor     v_x1.16b, v_tmp1_x1.16b, v_y1.16b
        eor     v_x2.16b, v_tmp1_x2.16b, v_y2.16b
        eor     v_x3.16b, v_tmp1_x3.16b, v_y3.16b

        bhi     .crc_fold_loop

/* carry-less multiplication, part3 - after loop */
/* folding 512bit ---> 128bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v30 are tmp registers

.crc_fold_loop_end:
        mov     x_tmp, 0x4c1a0000       /* p1 [1] */
        fmov    d0, x_tmp
        mov     x_tmp, 0xfb0b0000       /* p1 [0] */
        fmov    d1, x_tmp

        and     w_counter, w_len, -64
        sxtw    x_tmp, w_counter
        add     x_buf, x_buf, x_tmp

        dup     d6, v_x0.d[1]
        dup     d30, v_x0.d[0]
        pmull   v6.1q, v6.1d, v0.1d
        pmull   v30.1q, v30.1d, v1.1d
        eor     v6.16b, v6.16b, v30.16b
        eor     v_x1.16b, v6.16b, v_x1.16b

        dup     d6, v_x1.d[1]
        dup     d30, v_x1.d[0]
        pmull   v6.1q, v6.1d, v0.1d
        pmull   v16.1q, v30.1d, v1.1d
        eor     v6.16b, v6.16b, v16.16b
        eor     v_x2.16b, v6.16b, v_x2.16b

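/*
 * Each of these three stanzas (one more follows below) folds a 128-bit
 * accumulator into its neighbour; in GF(2) polynomial terms roughly:
 *   x[i+1] ^= clmul(x[i].hi, p1[1]) ^ clmul(x[i].lo, p1[0])
 * so the 512 bits of loop state collapse into a single 128-bit value,
 * ending up in v_x0. The p1 constants are taken as given here; they
 * would be the x^N mod P shift factors for a 128-bit advance, but the
 * source does not state the exponents.
 */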
        dup     d_x0, v_x2.d[1]
        dup     d30, v_x2.d[0]
        pmull   v0.1q, v_x0.1d, v0.1d
        pmull   v_x0.1q, v30.1d, v1.1d
        eor     v1.16b, v0.16b, v_x0.16b
        eor     v_x0.16b, v1.16b, v_x3.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 fold function */
d_16fold_p0_h           .req    d18
v_16fold_p0_h           .req    v18

d_16fold_p0_l           .req    d4
v_16fold_p0_l           .req    v4

v_16fold_from           .req    v_x0
d_16fold_from_h         .req    d3
v_16fold_from_h         .req    v3

v_16fold_zero           .req    v7

v_16fold_from1          .req    v16

v_16fold_from2          .req    v0
d_16fold_from2_h        .req    d6
v_16fold_from2_h        .req    v6

v_16fold_tmp            .req    v0

        movi    v_16fold_zero.4s, 0
        mov     x_tmp1, 0x2d560000      /* p0 [1] */
        mov     x_tmp2, 0x13680000      /* p0 [0] */

        ext     v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
        ext     v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

        dup     d_16fold_from_h, v_16fold_from.d[1]
        fmov    d_16fold_p0_h, x_tmp1
        pmull   v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
        eor     v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

        dup     d_16fold_from2_h, v_16fold_from2.d[1]
        fmov    d_16fold_p0_l, x_tmp2
        pmull   v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
        eor     v_x0.16b, v0.16b, v6.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0: v2
// barrett reduction constants: br[0], br[1]

d_br0   .req    d3
v_br0   .req    v3
d_br1   .req    d5
v_br1   .req    v5

        mov     x_tmp1, 0x57f9                  /* br[0] low */
        movk    x_tmp1, 0xf65a, lsl 16          /* br[0] high */
        movk    x_tmp1, 0x1, lsl 32
        fmov    d_br0, x_tmp1

        dup     d1, v_x0.d[0]
        dup     d1, v1.d[0]
        ext     v1.16b, v1.16b, v7.16b, #4
        pmull   v4.1q, v1.1d, v_br0.1d

        ext     v1.16b, v4.16b, v7.16b, #4
        mov     x_tmp1, 0x8bb70000              /* br[1] low */
        movk    x_tmp1, 0x1, lsl 32             /* br[1] high */

        fmov    d_br1, x_tmp1
        pmull   v_br1.1q, v1.1d, v_br1.1d
        eor     v_x0.16b, v_x0.16b, v_br1.16b

        umov    x0, v_x0.d[0]
        ubfx    x0, x0, 16, 16
        b       .crc_table_loop_pre

        .size   crc16_t10dif_pmull, .-crc16_t10dif_pmull

        .section        .rodata

        .align  4
.shuffle_mask_lanchor = . + 0
        .type   shuffle_mask, %object
        .size   shuffle_mask, 16
shuffle_mask:
        .byte   15, 14, 13, 12, 11, 10, 9, 8
        .byte   7, 6, 5, 4, 3, 2, 1, 0

        .align  4
.LANCHOR0 = . + 0
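/*
 * crc16tab below is the standard 256-entry byte-at-a-time lookup table
 * for the T10-DIF polynomial 0x8bb7 (note table[1] == 0x8bb7, the
 * polynomial itself). The same polynomial is visible in the Barrett
 * constant br[1] = 0x1_8bb7_0000 above, i.e. P with its implicit x^16
 * bit, left-aligned; br[0] is the precomputed Barrett quotient and is
 * taken as given here. After the reduction, ubfx extracts the 16-bit
 * CRC and the branch back to .crc_table_loop_pre finishes the tail
 * bytes (len mod 64) through this table.
 */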
        .type   crc16tab, %object
        .size   crc16tab, 512
crc16tab:
        .hword  0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
        .hword  0xefbd, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
        .hword  0x54cd, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
        .hword  0xbb70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
        .hword  0xa99a, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
        .hword  0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
        .hword  0xfd57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
        .hword  0x12ea, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
        .hword  0xd883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
        .hword  0x373e, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
        .hword  0x8c4e, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
        .hword  0x63f3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
        .hword  0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
        .hword  0x9ea4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
        .hword  0x25d4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
        .hword  0xca69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
        .hword  0x3ab1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
        .hword  0xd50c, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
        .hword  0x6e7c, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
        .hword  0x81c1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
        .hword  0x932b, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
        .hword  0x7c96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
        .hword  0xc7e6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
        .hword  0x285b, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
        .hword  0xe232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
        .hword  0x0d8f, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
        .hword  0xb6ff, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
        .hword  0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
        .hword  0x4ba8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
        .hword  0xa415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
        .hword  0x1f65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
        .hword  0xf0d8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3
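/*
 * Minimal usage sketch from C; the prototype is the one documented at
 * the top of this file, and a zero seed is the conventional T10-DIF
 * starting value:
 *
 *   extern uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf,
 *                                      uint64_t len);
 *
 *   uint16_t crc = crc16_t10dif_pmull(0, data, data_len);
 */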