########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

	.arch armv8-a+crc+crypto
	.text
	.align	3
	.global	crc16_t10dif_copy_pmull
	.type	crc16_t10dif_copy_pmull, %function

/* uint16_t crc16_t10dif_copy_pmull(uint16_t seed, uint8_t *dst,
 *                                  uint8_t *src, uint64_t len)
 *
 * Computes the CRC16-T10DIF (polynomial 0x8bb7) of src[0..len) while
 * copying those bytes to dst, returning the updated CRC.
 *
 * ABI:   AAPCS64
 * In:    w0 = seed (initial CRC; only the low 16 bits are used)
 *        x1 = dst
 *        x2 = src
 *        x3 = len in bytes
 * Out:   w0 = resulting CRC16, zero-extended
 * Clobb: x0-x7, x11, x12, v0-v7, v16-v31, flags
 *        (caller-saved registers only; x19-x28 / v8-v15 untouched)
 *
 * Strategy: buffers of <= 1023 bytes run the byte-at-a-time table loop.
 * Longer buffers fold 64 bytes per iteration with PMULL, reduce
 * 512 -> 128 -> 16 bits (fold + Barrett reduction), then branch back to
 * the table loop to finish the sub-64-byte tail.
 */

/* arguments */
w_seed		.req	w0
x_dst		.req	x1
x_src		.req	x2
x_len		.req	x3
w_len		.req	w3

/* returns */
w_ret		.req	w0

/* these as global temporary registers */
w_tmp		.req	w6
x_tmp		.req	x6
x_tmp1		.req	x7
x_tmp2		.req	x11

d_tmp1		.req	d0
d_tmp2		.req	d1
q_tmp1		.req	q0
q_tmp2		.req	q1
v_tmp1		.req	v0
v_tmp2		.req	v1

/* local variables */
w_counter	.req	w4
w_crc		.req	w0	/* aliases w_seed: the seed becomes the running CRC */
x_crc		.req	x0
x_counter	.req	x4
x_crc16tab	.req	x5
x_src_saved	.req	x0	/* aliases x_crc: only live inside the fold loop */
x_dst_saved	.req	x12

crc16_t10dif_copy_pmull:
	cmp	x_len, 1023
	sub	sp, sp, #16		/* NOTE(review): reserved but never stored to; kept as-is */
	uxth	w_seed, w_seed
	bhi	.crc_fold		/* >= 1024 bytes: take the PMULL folding path */

	mov	x_tmp, 0
	mov	w_counter, 0

/* Byte-at-a-time path; also finishes the tail after the fold path,
 * which re-enters here with x_tmp = w_counter = len & ~63. */
.crc_table_loop_pre:
	cmp	x_len, x_tmp
	bls	.end

	sxtw	x_counter, w_counter
	adrp	x_crc16tab, .LANCHOR0
	/* rebias src/dst so [base, x_counter] addresses the next byte */
	sub	x_src, x_src, x_counter
	sub	x_dst, x_dst, x_counter
	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0

	.align	2
.crc_table_loop:
	ldrb	w_tmp, [x_src, x_counter]
	strb	w_tmp, [x_dst, x_counter]	/* copy as we go */
	add	x_counter, x_counter, 1
	cmp	x_len, x_counter
	/* crc = (crc << 8) ^ table[(crc >> 8) ^ byte] */
	eor	w_tmp, w_tmp, w_crc, lsr 8
	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]
	eor	w_crc, w_tmp, w_crc, lsl 8
	uxth	w_crc, w_crc
	bhi	.crc_table_loop

.end:
	add	sp, sp, 16
	ret

/* carry less multiplication, part1 - before loop */
q_x0		.req	q2
q_x1		.req	q3
q_x2		.req	q4
q_x3		.req	q5

v_x0		.req	v2
v_x1		.req	v3
v_x2		.req	v4
v_x3		.req	v5

d_x0		.req	d2
d_x1		.req	d3
d_x2		.req	d4
d_x3		.req	d5

// the following registers are only used in this part1
d_tmp3		.req	d16
v_tmp3		.req	v16

	.align	3
.crc_fold:
	/* Build v_tmp3 = { 0, seed << 48 } so the seed lands in the top
	 * 16 bits of the first (byte-reversed) 16-byte lane. */
	fmov	d_tmp1, x_crc
	fmov	d_tmp2, xzr
	dup	d_tmp3, v_tmp2.d[0]
	shl	d_tmp1, d_tmp1, 48
	ins	v_tmp3.d[1], v_tmp1.d[0]

	/* x_counter = number of bytes handled by the fold loop, minus the
	 * 64 bytes consumed before the loop starts */
	and	x_counter, x_len, -64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63
	add	x_src_saved, x_src, 64
	add	x_dst_saved, x_dst, 64

	/* load and copy the first 64 bytes */
	ldr	q_x0, [x_src]
	ldr	q_x1, [x_src, 16]
	ldr	q_x2, [x_src, 32]
	ldr	q_x3, [x_src, 48]

	str	q_x0, [x_dst]
	str	q_x1, [x_dst, 16]
	str	q_x2, [x_dst, 32]
	str	q_x3, [x_dst, 48]

	/* v7 = byte-reversal mask (big-endian bit order for the CRC math) */
	adrp	x_tmp, .shuffle_mask_lanchor
	ldr	q7, [x_tmp, :lo12:.shuffle_mask_lanchor]

	tbl	v_tmp1.16b, {v_x0.16b}, v7.16b
	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b	/* fold the seed into lane 0 */

	tbl	v_x1.16b, {v_x1.16b}, v7.16b
	tbl	v_x2.16b, {v_x2.16b}, v7.16b
	tbl	v_x3.16b, {v_x3.16b}, v7.16b
	bls	.crc_fold_loop_end	/* exactly 64..127 bytes: no loop iterations */

/* carry less multiplication, part2 - loop */
q_y0		.req	q28
q_y1		.req	q29
q_y2		.req	q30
q_y3		.req	q31

v_y0		.req	v28
v_y1		.req	v29
v_y2		.req	v30
v_y3		.req	v31

d_x0_h		.req	d24
d_x0_l		.req	d2
d_x1_h		.req	d25
d_x1_l		.req	d3
d_x2_h		.req	d26
d_x2_l		.req	d4
d_x3_h		.req	d27
d_x3_l		.req	d5

v_x0_h		.req	v24
v_x0_l		.req	v2
v_x1_h		.req	v25
v_x1_l		.req	v3
v_x2_h		.req	v26
v_x2_l		.req	v4
v_x3_h		.req	v27
v_x3_l		.req	v5

v_tmp1_x0	.req	v24
v_tmp1_x1	.req	v25
v_tmp1_x2	.req	v26
v_tmp1_x3	.req	v27

d_p4_h		.req	d19
v_p4_h		.req	v19
d_p4_l		.req	d17
v_p4_l		.req	v17

	mov	x_tmp, 0x371d0000		/* p4 [1] */
	fmov	d_p4_h, x_tmp
	mov	x_tmp, 0x87e70000		/* p4 [0] */
	fmov	d_p4_l, x_tmp

	.align	2
/* Each iteration folds the 512-bit state (v_x0..v_x3) across the next
 * 64 input bytes: x_i = clmul(x_i.hi, p4.hi) ^ clmul(x_i.lo, p4.lo) ^ y_i,
 * while copying those bytes to dst. */
.crc_fold_loop:
	add	x_src_saved, x_src_saved, 64
	add	x_dst_saved, x_dst_saved, 64

	sub	x_counter, x_counter, #64
	cmp	x_counter, 63

	dup	d_x0_h, v_x0.d[1]
	dup	d_x1_h, v_x1.d[1]
	dup	d_x2_h, v_x2.d[1]
	dup	d_x3_h, v_x3.d[1]

	dup	d_x0_l, v_x0.d[0]
	dup	d_x1_l, v_x1.d[0]
	dup	d_x2_l, v_x2.d[0]
	dup	d_x3_l, v_x3.d[0]

	ldr	q_y0, [x_src_saved, -64]
	ldr	q_y1, [x_src_saved, -48]
	ldr	q_y2, [x_src_saved, -32]
	ldr	q_y3, [x_src_saved, -16]

	str	q_y0, [x_dst_saved, -64]
	str	q_y1, [x_dst_saved, -48]
	str	q_y2, [x_dst_saved, -32]
	str	q_y3, [x_dst_saved, -16]

	pmull	v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
	pmull	v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
	pmull	v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
	pmull	v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
	pmull	v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
	pmull	v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
	pmull	v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
	pmull	v_x3_l.1q, v_x3_l.1d, v_p4_l.1d

	tbl	v_y0.16b, {v_y0.16b}, v7.16b
	tbl	v_y1.16b, {v_y1.16b}, v7.16b
	tbl	v_y2.16b, {v_y2.16b}, v7.16b
	tbl	v_y3.16b, {v_y3.16b}, v7.16b

	eor	v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
	eor	v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
	eor	v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
	eor	v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b

	eor	v_x0.16b, v_tmp1_x0.16b, v_y0.16b
	eor	v_x1.16b, v_tmp1_x1.16b, v_y1.16b
	eor	v_x2.16b, v_tmp1_x2.16b, v_y2.16b
	eor	v_x3.16b, v_tmp1_x3.16b, v_y3.16b

	bhi	.crc_fold_loop

/* carry less multiplication, part3 - after loop */
/* folding 512bit ---> 128bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v30 are tmp registers

.crc_fold_loop_end:
	mov	x_tmp, 0x4c1a0000		/* p1 [1] */
	fmov	d0, x_tmp
	mov	x_tmp, 0xfb0b0000		/* p1 [0] */
	fmov	d1, x_tmp

	/* advance src/dst past the folded region; the remaining
	 * len & 63 tail bytes are handled by the table loop */
	and	w_counter, w_len, -64
	sxtw	x_tmp, w_counter

	add	x_src, x_src, x_tmp
	add	x_dst, x_dst, x_tmp

	/* fold x0 into x1 */
	dup	d6, v_x0.d[1]
	dup	d30, v_x0.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v30.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v30.16b
	eor	v_x1.16b, v6.16b, v_x1.16b

	/* fold x1 into x2 */
	dup	d6, v_x1.d[1]
	dup	d30, v_x1.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v16.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v16.16b
	eor	v_x2.16b, v6.16b, v_x2.16b

	/* fold x2 into x3; result lands back in v_x0 */
	dup	d_x0, v_x2.d[1]
	dup	d30, v_x2.d[0]
	pmull	v0.1q, v_x0.1d, v0.1d
	pmull	v_x0.1q, v30.1d, v1.1d
	eor	v1.16b, v0.16b, v_x0.16b
	eor	v_x0.16b, v1.16b, v_x3.16b

/* carry less multiplication, part3 - after loop */
/* crc16 fold function: 128 bits -> 64 bits */
d_16fold_p0_h		.req	d18
v_16fold_p0_h		.req	v18

d_16fold_p0_l		.req	d4
v_16fold_p0_l		.req	v4

v_16fold_from		.req	v_x0
d_16fold_from_h		.req	d3
v_16fold_from_h		.req	v3

v_16fold_zero		.req	v7

v_16fold_from1		.req	v16

v_16fold_from2		.req	v0
d_16fold_from2_h	.req	d6
v_16fold_from2_h	.req	v6

v_16fold_tmp		.req	v0

	movi	v_16fold_zero.4s, 0
	mov	x_tmp1, 0x2d560000		/* p0 [1] */
	mov	x_tmp2, 0x13680000		/* p0 [0] */

	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

	dup	d_16fold_from_h, v_16fold_from.d[1]
	fmov	d_16fold_p0_h, x_tmp1
	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

	dup	d_16fold_from2_h, v_16fold_from2.d[1]
	fmov	d_16fold_p0_l, x_tmp2
	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
	eor	v_x0.16b, v0.16b, v6.16b

/* carry less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0: v2
// barrett reduction constants: br[0], br[1]

d_br0	.req	d3
v_br0	.req	v3
d_br1	.req	d5
v_br1	.req	v5

	mov	x_tmp1, 0x57f9			/* br[0] low */
	movk	x_tmp1, 0xf65a, lsl 16		/* br[0] high */
	movk	x_tmp1, 0x1, lsl 32
	fmov	d_br0, x_tmp1

	dup	d1, v_x0.d[0]
	dup	d1, v1.d[0]
	ext	v1.16b, v1.16b, v7.16b, #4
	pmull	v4.1q, v1.1d, v_br0.1d

	ext	v1.16b, v4.16b, v7.16b, #4
	mov	x_tmp1, 0x8bb70000		/* br[1] low */
	movk	x_tmp1, 0x1, lsl 32		/* br[1] high */

	fmov	d_br1, x_tmp1
	pmull	v_br1.1q, v1.1d, v_br1.1d
	eor	v_x0.16b, v_x0.16b, v_br1.16b

	/* extract the 16-bit CRC and finish the tail via the table loop
	 * (x0 doubles as w_crc there; x_tmp/w_counter hold len & ~63) */
	umov	x0, v_x0.d[0]
	ubfx	x0, x0, 16, 16
	b	.crc_table_loop_pre

	.size	crc16_t10dif_copy_pmull, .-crc16_t10dif_copy_pmull

	.section	.rodata

	/* byte-reversal mask used by all the tbl instructions above */
	.align	4
.shuffle_mask_lanchor = . + 0
	.type	shuffle_mask, %object
	.size	shuffle_mask, 16
shuffle_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7, 6, 5, 4, 3, 2, 1, 0

	/* 256-entry lookup table for CRC16-T10DIF (poly 0x8bb7), one
	 * 16-bit entry per input byte value */
	.align	4
.LANCHOR0 = . + 0
	.type	crc16tab, %object
	.size	crc16tab, 512
crc16tab:
	.hword	0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
	.hword	0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
	.hword	0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
	.hword	0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
	.hword	0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
	.hword	0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
	.hword	0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
	.hword	0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
	.hword	0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
	.hword	0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
	.hword	0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
	.hword	0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
	.hword	0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
	.hword	0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
	.hword	0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
	.hword	0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
	.hword	0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
	.hword	0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
	.hword	0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
	.hword	0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
	.hword	0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
	.hword	0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
	.hword	0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
	.hword	0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
	.hword	0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
	.hword	0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
	.hword	0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
	.hword	0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
	.hword	0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
	.hword	0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
	.hword	0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
	.hword	0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3