/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

.text

.global gf_5vect_dot_prod_neon
.type gf_5vect_dot_prod_neon, %function

/* arguments (AAPCS64: x0-x7 carry integer/pointer args) */
x_len	.req	x0	/* length of each buffer, in bytes */
x_vec	.req	x1	/* number of source vectors; turned into a byte stride (vec*8) early on */
x_tbl	.req	x2	/* base of the GF multiply tables (32 bytes consumed per (dest, src) pair below) */
x_src	.req	x3	/* array of source-buffer pointers */
x_dest	.req	x4	/* array of 5 destination-buffer pointers */

/* returns */
w_ret	.req	w0	/* 0 = pass, 1 = fail (len < 16) */

/* local variables (all caller-saved; no x19-x28 touched, so no GPR spills needed) */
x_vec_i	.req	x5	/* byte offset into the src pointer array: 8 * current vector index */
x_ptr	.req	x6	/* current source/dest data pointer */
x_pos	.req	x7	/* byte position within the buffers */
x_tmp	.req	x8	/* current table pointer */
x_dest1	.req	x9
x_dest2	.req	x10
x_dest3	.req	x11
x_dest4	.req	x12
x_dest5	.req	x13

/* vectors */
v_tmp1	.req	v0
q_tmp1	.req	q0
v_tmp2	.req	v1
q_tmp2	.req	q1

/* v0/v1 are multiplexed: mask and TBL results never need to be live at once */
v_mask0f	.req	v_tmp1
q_mask0f	.req	q_tmp1
v_tmp_lo	.req	v_tmp1
v_tmp_hi	.req	v_tmp2

v_gft_lo	.req	v2
v_gft_hi	.req	v3
q_gft_lo	.req	q2
q_gft_hi	.req	q3

/* accumulators: v_pD_B = partial product for dest D, 16-byte chunk B of the 64-byte block */
v_p1_0	.req	v4
v_p2_0	.req	v5
v_p3_0	.req	v6
v_p4_0	.req	v7

q_p1_0	.req	q4
q_p2_0	.req	q5
q_p3_0	.req	q6
q_p4_0	.req	q7

/* v8-v15: low 64 bits are callee-saved per AAPCS64 -> saved around .Lloop64 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* high nibbles are shifted in place, overwriting the raw data registers */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_p5_0	.req	v16
v_p1_1	.req	v17
v_p2_1	.req	v18
v_p3_1	.req	v19
v_p4_1	.req	v20
v_p5_1	.req	v21
v_p1_2	.req	v22
v_p2_2	.req	v23
v_p3_2	.req	v24
v_p4_2	.req	v25
v_p5_2	.req	v26
v_p1_3	.req	v27
v_p2_3	.req	v28
v_p3_3	.req	v29
v_p4_3	.req	v30
v_p5_3	.req	v31

q_p5_0	.req	q16
q_p1_1	.req	q17
q_p2_1	.req	q18
q_p3_1	.req	q19
q_p4_1	.req	q20
q_p5_1	.req	q21
q_p1_2	.req	q22
q_p2_2	.req	q23
q_p3_2	.req	q24
q_p4_2	.req	q25
q_p5_2	.req	q26
q_p1_3	.req	q27
q_p2_3	.req	q28
q_p3_3	.req	q29
q_p4_3	.req	q30
q_p5_3	.req	q31

/* 16-byte loop reuses the upper accumulator registers as scratch:
 * only v_pD_0 accumulate there, so v17+ are free for data/tables.
 * This also keeps the 16-byte path off v8-v15, so it needs no
 * callee-save spill. */
v_data		.req	v_p1_1
q_data		.req	q_p1_1
v_data_lo	.req	v_p2_1
v_data_hi	.req	v_p3_1

v_gft1_lo	.req	v_p4_1
v_gft1_hi	.req	v_p5_1
v_gft2_lo	.req	v_p1_2
v_gft2_hi	.req	v_p2_2
v_gft3_lo	.req	v_p3_2
v_gft3_hi	.req	v_p4_2
v_gft4_lo	.req	v_p5_2
v_gft4_hi	.req	v_p1_3
v_gft5_lo	.req	v_p2_3
v_gft5_hi	.req	v_p3_3
q_gft1_lo	.req	q_p4_1
q_gft1_hi	.req	q_p5_1
q_gft2_lo	.req	q_p1_2
q_gft2_hi	.req	q_p2_2
q_gft3_lo	.req	q_p3_2
q_gft3_hi	.req	q_p4_2
q_gft4_lo	.req	q_p5_2
q_gft4_hi	.req	q_p1_3
q_gft5_lo	.req	q_p2_3
q_gft5_hi	.req	q_p3_3

/*
 * gf_5vect_dot_prod_neon(len, vec, tbl, src, dest)
 *   x0 = len   : bytes per buffer (must be >= 16, else returns 1)
 *   x1 = vec   : number of source buffers
 *   x2 = tbl   : GF(2^8) lookup tables; the addressing below consumes
 *                32 bytes (16-byte low-nibble + 16-byte high-nibble table)
 *                per source vector, with a stride of vec*32 bytes between
 *                the five per-destination table groups
 *   x3 = src   : array of vec source pointers
 *   x4 = dest  : array of 5 destination pointers
 *
 * For each of the 5 destinations, XOR-accumulates the table-driven GF(2^8)
 * products of all source buffers, using the split-nibble TBL technique:
 * each data byte is split into low/high 4-bit halves, each half indexes a
 * 16-entry table via TBL, and the two lookups are EORed into the accumulator.
 *
 * Returns w0 = 0 on pass, 1 on fail (len < 16).
 * Clobbers: x5-x13, v0-v31 (d8-d15 preserved across the 64-byte loop).
 * NOTE(review): dest buffers are fully overwritten (stores, not
 * read-modify-write), and the tail may re-process overlapping bytes —
 * so dest must not alias src; presumably guaranteed by the caller.
 */
gf_5vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	mov	x_pos, #0
	/* x_vec becomes vec*8 = byte size of the src pointer array;
	 * also reused (lsl #2 -> vec*32) as the table-group stride */
	lsl	x_vec, x_vec, #3
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	ldr	x_dest5, [x_dest, #8*4]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (AAPCS64 callee-saved; only this path uses v8-v15) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias len by -64 so the loop test "pos <= len" means "a full
	 * 64-byte block remains"; undone at .Lloop64_end */
	sub	x_len, x_len, #64

.Lloop64:
	/* zero all 20 accumulators: 5 destinations x 4 16-byte chunks */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p4_1.16b, #0
	movi	v_p4_2.16b, #0
	movi	v_p4_3.16b, #0
	movi	v_p5_0.16b, #0
	movi	v_p5_1.16b, #0
	movi	v_p5_2.16b, #0
	movi	v_p5_3.16b, #0
	mov	x_vec_i, #0

.Lloop64_vects:
	/* load 64 bytes of the current source vector at x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_ptr, x_ptr, x_pos

	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16
	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16
	prfm	pldl2keep, [x_ptr]

	/* split each data byte into low / high nibble TBL indices;
	 * the ushr overwrites the raw data (hi aliases the data reg) */
	movi	v_mask0f.16b, #0x0f
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* v_p1_x: dest 1 tables live at tbl + vec_i*4 (= 32 bytes/vector) */
	add	x_tmp, x_tbl, x_vec_i, lsl #2
	add	x_vec_i, x_vec_i, #8
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	/* step to dest 2's table group: + vec*32 */
	add	x_tmp, x_tmp, x_vec, lsl #2

	/* lookup both nibbles and XOR-fold into the accumulator, per chunk */
	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b

	/* v_p2_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b

	/* v_p3_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b

	/* v_p4_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
	eor	v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
	eor	v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
	eor	v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b

	/* v_p5_x: last group, no further table-pointer advance needed */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
	eor	v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
	eor	v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
	eor	v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
	eor	v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b

	/* next source vector, until vec_i reaches vec*8 */
	cmp	x_vec_i, x_vec
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* write the completed 64-byte block to each destination */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_ptr, x_dest4, x_pos
	stp	q_p4_0, q_p4_1, [x_ptr], #32
	stp	q_p4_2, q_p4_3, [x_ptr]

	add	x_ptr, x_dest5, x_pos
	stp	q_p5_0, q_p5_1, [x_ptr], #32
	stp	q_p5_2, q_p5_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the -64 bias; done if the 64-byte loop consumed everything */
	add	x_len, x_len, #64
	cmp	x_pos, x_len
	beq	.return_pass

.Lloop16_init:
	/* bias len by -16 so "pos <= len" means "a full 16-byte block remains";
	 * if not even one aligned block is left, jump straight to the
	 * overlapping-tail handling */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.lessthan16_init

.Lloop16:
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p5_0.16b, #0
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]

	movi	v_mask0f.16b, #0x0f
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* load all five lo/hi table pairs for this source vector;
	 * same layout as the 64-byte loop: +vec*32 between dest groups */
	add	x_tmp, x_tbl, x_vec_i, lsl #2
	add	x_vec_i, x_vec_i, #8
	ldp	q_gft1_lo, q_gft1_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft2_lo, q_gft2_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft3_lo, q_gft3_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft4_lo, q_gft4_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft5_lo, q_gft5_hi, [x_tmp]

	/* lookups overwrite the table registers in place (safe: each
	 * table is consumed by exactly one TBL) */
	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	tbl	v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	tbl	v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b

	eor	v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
	eor	v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
	eor	v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
	eor	v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	str	q_p4_0, [x_dest4, x_pos]
	str	q_p5_0, [x_dest5, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* pos == len+16 (i.e. original len) means everything is done;
	 * otherwise a sub-16-byte tail remains */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16_init:
	/* tail < 16 bytes: rewind pos to len-16 and rerun .Lloop16 once,
	 * recomputing an overlapping (already-written) region so the final
	 * 16-byte load/store stays in bounds; requires len >= 16, which the
	 * entry check guarantees */
	mov	x_pos, x_len
	b	.Lloop16

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret