1/************************************************************** 2 Copyright (c) 2019 Huawei Technologies Co., Ltd. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Huawei Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

/*
 * gf_4vect_dot_prod_neon
 * ----------------------
 * Computes four GF(2^8) dot products over an array of source buffers,
 * writing one result buffer per destination, using NEON TBL-based
 * nibble lookups (split each byte into low/high 4-bit halves, look each
 * half up in a 16-entry table, XOR the results together).
 *
 * Register-level contract (AAPCS64):
 *   In:  x0 = len   byte length of each buffer; must be >= 16
 *        x1 = vec   number of source vectors (buffers)
 *        x2 = tbl   GF multiply tables: for each of the 4 destinations,
 *                   vec consecutive 32-byte entries (16-byte low-nibble
 *                   LUT followed by 16-byte high-nibble LUT)
 *        x3 = src   array of vec pointers to source buffers
 *        x4 = dest  array of 4 pointers to destination buffers
 *   Out: w0 = 0 on success, 1 if len < 16 (nothing written)
 *   Stack: 64 bytes used only on the wide path, to save d8-d15
 *          (callee-saved low halves of v8-v15 per AAPCS64).
 */
.text

.global gf_4vect_dot_prod_neon
.type gf_4vect_dot_prod_neon, %function

/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_tbl		.req	x2
x_src		.req	x3
x_dest		.req	x4

/* returns */
w_ret		.req	w0

/* local variables */
x_vec_i		.req	x5	/* byte offset into the src pointer array */
x_ptr		.req	x6	/* current source (or store) pointer */
x_pos		.req	x7	/* byte position processed so far */
x_tmp		.req	x8
x_dest1		.req	x9
x_tbl1		.req	x10	/* walking table pointer for dest 1 */
x_dest2		.req	x11
x_tbl2		.req	x12	/* walking table pointer for dest 2 */
x_dest3		.req	x13
x_tbl3		.req	x14	/* walking table pointer for dest 3 */
x_dest4		.req	x_dest	/* reuses x4: dest array pointer is dead after the 4 loads */
x_tbl4		.req	x15	/* walking table pointer for dest 4 */

/* vectors */
v_mask0f	.req	v0	/* constant 0x0f in every byte lane */
q_mask0f	.req	q0
v_tmp1_lo	.req	v1
v_tmp1_hi	.req	v2
v_tmp1		.req	v3
q_tmp1		.req	q3

/* accumulators: v_pD_C = partial product for dest D, 16-byte chunk C */
v_p1_0		.req	v4
v_p2_0		.req	v5
v_p3_0		.req	v6
v_p4_0		.req	v7

q_p1_0		.req	q4
q_p2_0		.req	q5
q_p3_0		.req	q6
q_p4_0		.req	q7

/* v8-v11 hold source data; their d-halves are callee-saved, hence the
 * d8~d15 spill on the 64-byte path below */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_p1_3		.req	v12
v_p2_3		.req	v13
v_p3_3		.req	v14
v_p4_3		.req	v15
q_p1_3		.req	q12
q_p2_3		.req	q13
q_p3_3		.req	q14
q_p4_3		.req	q15

/* per-destination lookup tables: lo = low-nibble LUT, hi = high-nibble LUT */
v_gft1_lo	.req	v16
v_gft1_hi	.req	v17
v_gft2_lo	.req	v18
v_gft2_hi	.req	v19
v_gft3_lo	.req	v20
v_gft3_hi	.req	v21
v_gft4_lo	.req	v22
v_gft4_hi	.req	v23
q_gft1_lo	.req	q16
q_gft1_hi	.req	q17
q_gft2_lo	.req	q18
q_gft2_hi	.req	q19
q_gft3_lo	.req	q20
q_gft3_hi	.req	q21
q_gft4_lo	.req	q22
q_gft4_hi	.req	q23

v_p1_1		.req	v24
v_p1_2		.req	v25
v_p2_1		.req	v26
v_p2_2		.req	v27
v_p3_1		.req	v28
v_p3_2		.req	v29
v_p4_1		.req	v30
v_p4_2		.req	v31

q_p1_1		.req	q24
q_p1_2		.req	q25
q_p2_1		.req	q26
q_p2_2		.req	q27
q_p3_1		.req	q28
q_p3_2		.req	q29
q_p4_1		.req	q30
q_p4_2		.req	q31

/* 16-byte path aliases: single data vector reuses the tmp registers */
v_data		.req	v_tmp1
q_data		.req	q_tmp1
v_data_lo	.req	v_tmp1_lo
v_data_hi	.req	v_tmp1_hi

gf_4vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	mov	x_pos, #0
	/* x_vec becomes vec*8: byte size of the src pointer array, and
	 * (when shifted left 2 more) vec*32 = table bytes per destination */
	lsl	x_vec, x_vec, #3
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]	/* clobbers x_dest (x4): last use */

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias len by 64 so the `ble` loop test below means
	 * "a full 64-byte chunk remains at x_pos" */
	sub	x_len, x_len, #64

.Lloop64:
	/* 64-byte path: process 4 x 16 bytes per destination per pass,
	 * accumulating the contribution of every source vector */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p4_1.16b, #0
	movi	v_p4_2.16b, #0
	movi	v_p4_3.16b, #0

	/* each destination's table region is vec*32 bytes long
	 * (x_vec already holds vec*8, so `lsl #2` gives vec*32) */
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

.Lloop64_vects:
	/* fetch next source pointer and advance to current position */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* load 64 bytes of source data interleaved with the four
	 * 32-byte table entries for this source vector */
	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16

	prfm	pldl1strm, [x_ptr]
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/* data_0 */
	/* split into nibbles: v_tmp1 = low, v_data_0 = high (in place) */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr	v_data_0.16b, v_data_0.16b, #4

	/* GF multiply via TBL on each nibble, then XOR into accumulator */
	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
	eor	v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
	eor	v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
	eor	v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
	eor	v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
	eor	v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
	eor	v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
	eor	v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
	eor	v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
	eor	v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
	eor	v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
	eor	v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b

	/* x_vec_i counts bytes through the src pointer array;
	 * x_vec holds vec*8, so this loops once per source vector */
	cmp	x_vec_i, x_vec
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* flush the four 64-byte accumulators to their destinations */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_ptr, x_dest4, x_pos
	stp	q_p4_0, q_p4_1, [x_ptr], #32
	stp	q_p4_2, q_p4_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the -64 bias; if everything was consumed we are done */
	add	x_len, x_len, #64
	cmp	x_pos, x_len
	beq	.return_pass

.Lloop16_init:
	/* bias len by 16: the loop runs while a full 16-byte chunk
	 * remains; if none does, handle only the overlapping tail */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.lessthan16_init

.Lloop16:
	/* 16-byte path: one chunk per destination per pass */
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p4_0.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	ldr	q_data, [x_ptr, x_pos]

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32

	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/* split into nibbles, then look up in place (the gft registers
	 * are reloaded next iteration, so overwriting them is safe) */
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	tbl	v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b

	eor	v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
	eor	v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	str	q_p4_0, [x_dest4, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - (len-16) == 16 means x_pos == len: fully processed */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16_init:
	/* non-multiple-of-16 tail: redo the last 16 bytes at offset
	 * len-16, overlapping the previously written region */
	mov	x_pos, x_len
	b	.Lloop16

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret