1/************************************************************** 2 Copyright (c) 2019 Huawei Technologies Co., Ltd. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Huawei Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

/*
 * gf_2vect_dot_prod_neon — GF(2^8) dot product of `vec` source buffers
 * into TWO destination buffers, using NEON tbl-based 4-bit lookups.
 *
 * Presumable C prototype (matches the ISA-L gf_Nvect_dot_prod family —
 * TODO confirm against the project header):
 *     int gf_2vect_dot_prod_neon(int len, int vec, unsigned char *gftbls,
 *                                unsigned char **src, unsigned char **dest);
 *
 * In:   x0 = len   (bytes per buffer; must be >= 16)
 *       x1 = vec   (number of source buffers)
 *       x2 = gftbls (32 bytes of lo/hi nibble tables per src per dest;
 *                    dest2's tables start vec*32 bytes after dest1's)
 *       x3 = src   (array of vec source pointers)
 *       x4 = dest  (array of 2 destination pointers)
 * Out:  w0 = 0 on success, 1 if len < 16
 * Clobbers: x5-x12, v0-v7, v16-v31, flags.  v8-v15 (d8-d15, callee-saved
 * low halves per AAPCS64) are spilled to the stack around the 128-byte
 * main loop, which is the only region that uses them.
 */
.text

.global gf_2vect_dot_prod_neon
.type gf_2vect_dot_prod_neon, %function

/* arguments */
x_len   .req x0                 /* byte length of each buffer */
x_vec   .req x1                 /* source count; becomes vec*8 after lsl #3 */
x_tbl   .req x2                 /* base of the GF multiply tables */
x_src   .req x3                 /* array of source pointers */
x_dest  .req x4                 /* array of the two dest pointers */

/* returns */
w_ret   .req w0

/* local variables */
x_vec_i .req x5                 /* byte offset into src[] (8 per pointer) */
x_ptr   .req x6                 /* scratch pointer (current src / dest) */
x_pos   .req x7                 /* byte position within the buffers */
x_tmp   .req x8
x_tbl1  .req x9                 /* walking table pointer for dest1 */
x_tbl2  .req x10                /* walking table pointer for dest2 */
x_dest1 .req x11
x_dest2 .req x12

/* vectors */
v_gft1_lo .req v0               /* dest1: lookup table for low nibbles */
v_gft1_hi .req v1               /* dest1: lookup table for high nibbles */
v_gft2_lo .req v2               /* dest2: lookup table for low nibbles */
v_gft2_hi .req v3               /* dest2: lookup table for high nibbles */
q_gft1_lo .req q0
q_gft1_hi .req q1
q_gft2_lo .req q2
q_gft2_hi .req q3

v_mask0f .req v4                /* 16 x 0x0f, isolates low nibbles */
q_mask0f .req q4

v_tmp1_lo .req v5               /* tbl result for low nibbles */
v_tmp1_hi .req v6               /* tbl result for high nibbles */
v_tmp1    .req v7               /* low nibbles of the current data */

/* 8 x 16B source data registers for the 128-byte main loop
 * (v8-v15: callee-saved low 64 bits, hence the d8-d15 spill) */
v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
v_data_4 .req v12
v_data_5 .req v13
v_data_6 .req v14
v_data_7 .req v15
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11
q_data_4 .req q12
q_data_5 .req q13
q_data_6 .req q14
q_data_7 .req q15

/* 8 x 16B accumulators per destination for the 128-byte main loop */
v_p1_0 .req v16
v_p1_1 .req v17
v_p1_2 .req v18
v_p1_3 .req v19
v_p1_4 .req v20
v_p1_5 .req v21
v_p1_6 .req v22
v_p1_7 .req v23
v_p2_0 .req v24
v_p2_1 .req v25
v_p2_2 .req v26
v_p2_3 .req v27
v_p2_4 .req v28
v_p2_5 .req v29
v_p2_6 .req v30
v_p2_7 .req v31

q_p1_0 .req q16
q_p1_1 .req q17
q_p1_2 .req q18
q_p1_3 .req q19
q_p1_4 .req q20
q_p1_5 .req q21
q_p1_6 .req q22
q_p1_7 .req q23
q_p2_0 .req q24
q_p2_1 .req q25
q_p2_2 .req q26
q_p2_3 .req q27
q_p2_4 .req q28
q_p2_5 .req q29
q_p2_6 .req q30
q_p2_7 .req q31

/* the 16-byte-per-iteration loop reuses the main loop's registers
 * (safe: the two loops never run concurrently) */
v_p1      .req v_p1_0
q_p1      .req q_p1_0
v_p2      .req v_p2_0
q_p2      .req q_p2_0
v_data    .req v_p1_1
q_data    .req q_p1_1
v_data_lo .req v_p1_2
v_data_hi .req v_p1_3

gf_2vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	mov	x_pos, #0
	lsl	x_vec, x_vec, #3	/* x_vec = vec * 8 = byte size of src[] */
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (AAPCS64 callee-saved) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* pre-bias so the loop condition is x_pos + 128 <= len */
	sub	x_len, x_len, #128

	/* one iteration = 128 output bytes for BOTH destinations */
.Lloop128:
	/* zero all 16 accumulators */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p1_4.16b, #0
	movi	v_p1_5.16b, #0
	movi	v_p1_6.16b, #0
	movi	v_p1_7.16b, #0

	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p2_4.16b, #0
	movi	v_p2_5.16b, #0
	movi	v_p2_6.16b, #0
	movi	v_p2_7.16b, #0

	mov	x_tbl1, x_tbl
	/* x_vec already holds vec*8, so lsl #2 gives vec*32:
	 * dest2's tables follow dest1's vec 32-byte table pairs */
	add	x_tbl2, x_tbl, x_vec, lsl #2
	mov	x_vec_i, #0

	/* accumulate one source buffer's contribution per pass */
.Lloop128_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* load 128 bytes of source data and this source's two
	 * 32-byte table pairs; loads interleaved for scheduling */
	ldp	q_data_0, q_data_1, [x_ptr], #32
	ldp	q_data_2, q_data_3, [x_ptr], #32

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_data_4, q_data_5, [x_ptr], #32
	ldp	q_data_6, q_data_7, [x_ptr], #32
	prfm	pldl1strm, [x_ptr]	/* prefetch next data (streaming) */
	prfm	pldl1keep, [x_tbl1]	/* keep tables resident in L1 */
	prfm	pldl1keep, [x_tbl2]

	/* Per 16B chunk: split each byte into nibbles, tbl-lookup the
	 * GF product of each nibble, and XOR (= GF add) both halves
	 * into the chunk's accumulator — once per destination.
	 * v_tmp1 holds the low nibbles; the data register is shifted
	 * in place to hold the high nibbles. */

	/* data_0 */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr	v_data_0.16b, v_data_0.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	/* data_4 */
	and	v_tmp1.16b, v_data_4.16b, v_mask0f.16b
	ushr	v_data_4.16b, v_data_4.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
	eor	v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
	eor	v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
	eor	v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
	eor	v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b

	/* data_5 */
	and	v_tmp1.16b, v_data_5.16b, v_mask0f.16b
	ushr	v_data_5.16b, v_data_5.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
	eor	v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
	eor	v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
	eor	v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
	eor	v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b

	/* data_6 */
	and	v_tmp1.16b, v_data_6.16b, v_mask0f.16b
	ushr	v_data_6.16b, v_data_6.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
	eor	v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
	eor	v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
	eor	v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
	eor	v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b

	/* data_7 */
	and	v_tmp1.16b, v_data_7.16b, v_mask0f.16b
	ushr	v_data_7.16b, v_data_7.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
	eor	v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
	eor	v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
	eor	v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
	eor	v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b

	cmp	x_vec_i, x_vec
	blt	.Lloop128_vects

.Lloop128_vects_end:
	/* flush the 128 accumulated bytes to each destination */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr], #32
	stp	q_p1_4, q_p1_5, [x_ptr], #32
	stp	q_p1_6, q_p1_7, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr], #32
	stp	q_p2_4, q_p2_5, [x_ptr], #32
	stp	q_p2_6, q_p2_7, [x_ptr]

	add	x_pos, x_pos, #128
	cmp	x_pos, x_len		/* x_len is biased by -128 here */
	ble	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_len, x_len, #128	/* undo the bias */
	cmp	x_pos, x_len
	beq	.return_pass		/* len was a multiple of 128: done */

	/* handle the remaining (len % 128) bytes, 16 at a time */
.Lloop16_init:
	sub	x_len, x_len, #16	/* bias: loop while x_pos + 16 <= len */
	cmp	x_pos, x_len
	bgt	.lessthan16_init	/* fewer than 16 bytes left */

.Lloop16:
	movi	v_p1.16b, #0
	movi	v_p2.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl, x_vec, lsl #2	/* vec*32, as in the main loop */
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]
	add	x_vec_i, x_vec_i, #8

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32

	/* same nibble-split tbl/eor scheme as the main loop,
	 * on a single 16-byte chunk */
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	eor	v_p1.16b, v_tmp1_lo.16b, v_p1.16b
	eor	v_p1.16b, v_p1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	eor	v_p2.16b, v_tmp1_lo.16b, v_p2.16b
	eor	v_p2.16b, v_p2.16b, v_tmp1_hi.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1, [x_dest1, x_pos]
	str	q_p2, [x_dest2, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - x_len == 16 exactly when the buffers ended on a
	 * 16-byte boundary; otherwise fall through to the tail */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.return_pass

	/* Unaligned tail (< 16 bytes left): back up so the final
	 * 16-byte block ends exactly at len, deliberately recomputing
	 * (and rewriting) some already-produced output bytes, then
	 * run .Lloop16 once more.  After it, x_pos - x_len == 16, so
	 * .Lloop16_end routes to .return_pass. */
.lessthan16_init:
	mov	x_pos, x_len		/* x_len is len-16 here */
	b	.Lloop16

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret