/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

/*
 * gf_6vect_mad_neon
 *
 * Syntax: GNU as, AArch64, ELF.
 *
 * For every source byte s at offset i, and for each of the six destination
 * buffers j = 1..6, this routine performs
 *
 *	dest_j[i] ^= tbl_j[s & 0x0f] ^ tbl_j[16 + (s >> 4)]
 *
 * where tbl_j is a 32-byte table: 16 bytes indexed by the low nibble
 * followed by 16 bytes indexed by the high nibble.
 *
 * In:	x0 = len	number of bytes to process
 *	x1 = vec	stride between consecutive tables, in 32-byte units
 *	x2 = vec_i	index of the first table, in 32-byte units
 *	x3 = tbl	base address of the tables
 *	x4 = src	source buffer
 *	x5 = dest	address of an array of six destination pointers
 * Out:	w0 = 0 on success, 1 when len < 16 (nothing is read or written)
 *
 * Table addresses: tbl_1 = tbl + vec_i * 32, tbl_(j+1) = tbl_j + vec * 32.
 *
 * Clobbers: x0-x17, v0-v7, v16-v31, flags.  d8-d15 are used only by the
 * 64-byte loop and are saved/restored on the stack around it.
 *
 * The length is consumed in three stages: 64 bytes per iteration,
 * then 16 bytes per iteration, then one final 16-byte block that ends
 * exactly at src + len.  That last block overlaps bytes that were already
 * processed; a byte mask keeps the overlapped bytes from being updated twice.
 */

.text
.global gf_6vect_mad_neon
.type gf_6vect_mad_neon, %function


/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_vec_i		.req	x2
x_tbl		.req	x3
x_src		.req	x4
x_dest		.req	x5

/* returns */
w_ret		.req	w0

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x9
x_dest4		.req	x10
x_dest5		.req	x11
x_dest6		.req	x_dest
x_tmp		.req	x12
x_tbl1		.req	x13
x_tbl2		.req	x14
x_tbl3		.req	x15
x_tbl4		.req	x16
x_tbl5		.req	x17
x_tbl6		.req	x_tbl
/*
 * x_const shares x0 with x_len.  x_len is last read by the "cmp x_len, #64"
 * test, long before the tail code needs x_const, and w_ret is written only
 * after x_const is dead.  x18 is deliberately not used: it is the AAPCS64
 * platform register.
 */
x_const		.req	x0

/* vectors */
v_mask0f	.req	v0
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

/* tables 5 and 6 reuse the registers of tables 2 and 3 (reloaded each pass) */
v_gft4_lo	.req	v18
v_gft4_hi	.req	v19
q_gft4_lo	.req	q18
q_gft4_hi	.req	q19
v_gft5_lo	.req	v_gft2_lo
v_gft5_hi	.req	v_gft2_hi
q_gft5_lo	.req	q_gft2_lo
q_gft5_hi	.req	q_gft2_hi
v_gft6_lo	.req	v_gft3_lo
v_gft6_hi	.req	v_gft3_hi
q_gft6_lo	.req	q_gft3_lo
q_gft6_hi	.req	q_gft3_hi

v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

/* the high nibbles overwrite the raw data registers in place */
v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/* destinations 4..6 reuse the registers of destinations 1..3 */
v_d4_0		.req	v_d1_0
v_d4_1		.req	v_d1_1
v_d4_2		.req	v_d1_2
v_d4_3		.req	v_d1_3
q_d4_0		.req	q_d1_0
q_d4_1		.req	q_d1_1
q_d4_2		.req	q_d1_2
q_d4_3		.req	q_d1_3
v_d5_0		.req	v_d2_0
v_d5_1		.req	v_d2_1
v_d5_2		.req	v_d2_2
v_d5_3		.req	v_d2_3
q_d5_0		.req	q_d2_0
q_d5_1		.req	q_d2_1
q_d5_2		.req	q_d2_2
q_d5_3		.req	q_d2_3
v_d6_0		.req	v_d3_0
v_d6_1		.req	v_d3_1
v_d6_2		.req	v_d3_2
v_d6_3		.req	v_d3_3
q_d6_0		.req	q_d3_0
q_d6_1		.req	q_d3_1
q_d6_2		.req	q_d3_2
q_d6_3		.req	q_d3_3

/* single-block (16-byte) paths */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23

/*
 * acc ^= tlo[nlo] ^ thi[nhi], 16 lanes at once.
 * Clobbers v_tmp_lo and v_tmp_hi.
 */
.macro gf_mul_xor acc, tlo, thi, nlo, nhi
	tbl	v_tmp_lo.16b, {\tlo\().16b}, \nlo\().16b
	tbl	v_tmp_hi.16b, {\thi\().16b}, \nhi\().16b
	eor	\acc\().16b, v_tmp_lo.16b, \acc\().16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
.endm

/*
 * acc ^= (tlo[v_data_lo] ^ thi[v_data_hi]) & v_tmp.
 * v_tmp holds the byte mask selecting the lanes that may be updated.
 * Clobbers v_tmp_lo and v_tmp_hi.
 */
.macro gf_mul_xor_masked acc, tlo, thi
	tbl	v_tmp_lo.16b, {\tlo\().16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {\thi\().16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
.endm

gf_6vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	/* each table is 32 bytes: scale both indices by 32 */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_tbl5, x_tbl4, x_vec
	add	x_tbl6, x_tbl5, x_vec		/* overwrites x_tbl (same register) */
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	ldr	x_dest5, [x_dest, #8*4]
	ldr	x_dest6, [x_dest, #8*5]		/* must be last: overwrites x_dest */
	/* tables 1 and 4 own their registers and are loaded only once */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_src_end = last address at which a full 64-byte block still fits */
	sub	x_src_end, x_src_end, #64

.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	/* split every data byte into low and high nibble indices */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1 */
	gf_mul_xor	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d1_1, v_gft1_lo, v_gft1_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d1_2, v_gft1_lo, v_gft1_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d1_3, v_gft1_lo, v_gft1_hi, v_data_3_lo, v_data_3_hi

	/* dest2 */
	gf_mul_xor	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d2_1, v_gft2_lo, v_gft2_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d2_2, v_gft2_lo, v_gft2_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d2_3, v_gft2_lo, v_gft2_hi, v_data_3_lo, v_data_3_hi

	/* dest3 */
	gf_mul_xor	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d3_1, v_gft3_lo, v_gft3_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d3_2, v_gft3_lo, v_gft3_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d3_3, v_gft3_lo, v_gft3_hi, v_data_3_lo, v_data_3_hi

	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	/* second half: destinations 4..6 reuse the accumulator registers */
	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	ldr	q_d5_0, [x_dest5, #16*0]
	ldr	q_d5_1, [x_dest5, #16*1]
	ldr	q_d5_2, [x_dest5, #16*2]
	ldr	q_d5_3, [x_dest5, #16*3]

	ldr	q_d6_0, [x_dest6, #16*0]
	ldr	q_d6_1, [x_dest6, #16*1]
	ldr	q_d6_2, [x_dest6, #16*2]
	ldr	q_d6_3, [x_dest6, #16*3]

	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	/* dest4 */
	gf_mul_xor	v_d4_0, v_gft4_lo, v_gft4_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d4_1, v_gft4_lo, v_gft4_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d4_2, v_gft4_lo, v_gft4_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d4_3, v_gft4_lo, v_gft4_hi, v_data_3_lo, v_data_3_hi

	/* dest5 */
	gf_mul_xor	v_d5_0, v_gft5_lo, v_gft5_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d5_1, v_gft5_lo, v_gft5_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d5_2, v_gft5_lo, v_gft5_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d5_3, v_gft5_lo, v_gft5_hi, v_data_3_lo, v_data_3_hi

	/* dest6 */
	gf_mul_xor	v_d6_0, v_gft6_lo, v_gft6_hi, v_data_0_lo, v_data_0_hi
	gf_mul_xor	v_d6_1, v_gft6_lo, v_gft6_hi, v_data_1_lo, v_data_1_hi
	gf_mul_xor	v_d6_2, v_gft6_lo, v_gft6_hi, v_data_2_lo, v_data_2_hi
	gf_mul_xor	v_d6_3, v_gft6_lo, v_gft6_hi, v_data_3_lo, v_data_3_hi

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	str	q_d5_0, [x_dest5, #16*0]
	str	q_d5_1, [x_dest5, #16*1]
	str	q_d5_2, [x_dest5, #16*2]
	str	q_d5_3, [x_dest5, #16*3]
	add	x_dest5, x_dest5, #64

	str	q_d6_0, [x_dest6, #16*0]
	str	q_d6_1, [x_dest6, #16*1]
	str	q_d6_2, [x_dest6, #16*2]
	str	q_d6_3, [x_dest6, #16*3]
	add	x_dest6, x_dest6, #64

	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	/* undo the bias: x_src_end is the true end of the source again */
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* x_src_end = last address at which a full 16-byte block still fits */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.lessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf_mul_xor	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_lo, v_data_hi
	gf_mul_xor	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_lo, v_data_hi
	gf_mul_xor	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_lo, v_data_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	gf_mul_xor	v_d4_0, v_gft4_lo, v_gft4_hi, v_data_lo, v_data_hi
	gf_mul_xor	v_d5_0, v_gft5_lo, v_gft5_hi, v_data_lo, v_data_hi
	gf_mul_xor	v_d6_0, v_gft6_lo, v_gft6_hi, v_data_lo, v_data_hi

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	add	x_dest5, x_dest5, #16
	add	x_dest6, x_dest6, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.lessthan16_init:
	/*
	 * x_src_end still points 16 bytes before the true end, so
	 * x_tmp = 16 - (bytes left).  x_tmp == 16 means nothing is left.
	 */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16:
	/* step back x_tmp bytes so the final block ends exactly at the end */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp
	sub	x_dest5, x_dest5, x_tmp
	sub	x_dest6, x_dest6, x_tmp

	/*
	 * Load 16 bytes from const_tbl + 16 - x_tmp: the first x_tmp mask bytes
	 * are 0x00 (already processed, leave untouched), the rest are 0xff.
	 * The address is formed PC-relatively, so no absolute relocation is
	 * needed in the text section.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf_mul_xor_masked	v_d1_0, v_gft1_lo, v_gft1_hi
	gf_mul_xor_masked	v_d2_0, v_gft2_lo, v_gft2_hi
	gf_mul_xor_masked	v_d3_0, v_gft3_lo, v_gft3_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	gf_mul_xor_masked	v_d4_0, v_gft4_lo, v_gft4_hi
	gf_mul_xor_masked	v_d5_0, v_gft5_lo, v_gft5_hi
	gf_mul_xor_masked	v_d6_0, v_gft6_lo, v_gft6_hi

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret

/* read-only tail mask: 16 zero bytes followed by 16 0xff bytes */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff