1/************************************************************** 2 Copyright (c) 2019 Huawei Technologies Co., Ltd. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Huawei Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

.text

.global gf_4vect_mad_neon
.type gf_4vect_mad_neon, %function

/*
 * Register allocation for gf_4vect_mad_neon.
 * AAPCS64: x0-x7 carry the six arguments; x6-x15 are caller-saved
 * scratch; v8-v15 have callee-saved low halves (saved on the stack
 * before the 64-byte loop uses them).
 */

/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x7
x_dest2 .req x8
x_dest3 .req x9
x_dest4 .req x_dest          /* x_dest is dead after the four pointers are loaded, so x5 is reused */
x_tmp .req x10
x_tbl1 .req x11
x_tbl2 .req x12
x_tbl3 .req x13
x_tbl4 .req x14
x_const .req x15

/* vectors */
v_mask0f .req v0             /* constant 0x0f in every byte lane, splits bytes into nibbles */
v_tmp_lo .req v1
v_tmp_hi .req v2
v_tmp .req v3
q_tmp .req q3

/* per-destination GF multiply tables: 16-byte low-nibble and high-nibble LUTs */
v_gft1_lo .req v4
v_gft1_hi .req v5
v_gft2_lo .req v6
v_gft2_hi .req v7
v_gft3_lo .req v16
v_gft3_hi .req v17
v_gft4_lo .req v18
v_gft4_hi .req v19
q_gft1_lo .req q4
q_gft1_hi .req q5
q_gft2_lo .req q6
q_gft2_hi .req q7
q_gft3_lo .req q16
q_gft3_hi .req q17
q_gft4_lo .req q18
q_gft4_hi .req q19

/* four 16-byte source chunks processed per 64-byte loop iteration */
v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_data_0_lo .req v12
v_data_1_lo .req v13
v_data_2_lo .req v14
v_data_3_lo .req v15
/* the high-nibble shift overwrites the source registers in place */
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3

/* destination accumulators, four 16-byte chunks per destination */
v_d1_0 .req v20
v_d1_1 .req v21
v_d1_2 .req v22
v_d1_3 .req v23
v_d2_0 .req v24
v_d2_1 .req v25
v_d2_2 .req v26
v_d2_3 .req v27
v_d3_0 .req v28
v_d3_1 .req v29
v_d3_2 .req v30
v_d3_3 .req v31
q_d1_0 .req q20
q_d1_1 .req q21
q_d1_2 .req q22
q_d1_3 .req q23
q_d2_0 .req q24
q_d2_1 .req q25
q_d2_2 .req q26
q_d2_3 .req q27
q_d3_0 .req q28
q_d3_1 .req q29
q_d3_2 .req q30
q_d3_3 .req q31

/* dest4 reuses the dest1 registers: dest1/dest2 are computed and
 * stored before dest3/dest4 are loaded in the 64-byte loop */
v_d4_0 .req v_d1_0
v_d4_1 .req v_d1_1
v_d4_2 .req v_d1_2
v_d4_3 .req v_d1_3
q_d4_0 .req q_d1_0
q_d4_1 .req q_d1_1
q_d4_2 .req q_d1_2
q_d4_3 .req q_d1_3
/* single-chunk aliases for the 16-byte tail loops (reuse d1/d2/d3 regs) */
v_data .req v21
q_data .req q21
v_data_lo .req v22
v_data_hi .req v23

/*-----------------------------------------------------------------------
 * gf_4vect_mad_neon(len, vec, vec_i, gftbls, src, dest)
 *
 * GF(2^8) multiply-and-add: for each of four destination buffers
 * dest[k], computes dest[k][i] ^= gf_mul(src[i], tbl_k) over len bytes.
 * Each byte product is formed with two 16-entry nibble lookup tables
 * (tbl on low nibble, tbl+16 on high nibble) combined with eor.
 *
 * In:   x0 = len    byte count (must be >= 16)
 *       x1 = vec    number of source vectors (scales the 32-byte table stride)
 *       x2 = vec_i  index of this source vector within the tables
 *       x3 = gftbls base of the concatenated 32-byte lookup tables
 *       x4 = src    source buffer
 *       x5 = dest   array of 4 destination buffer pointers
 * Out:  w0 = 0 on success, 1 if len < 16
 *       NOTE(review): the C prototype in callers may declare this void,
 *       in which case w0 is ignored — confirm against the header.
 * Clobbers: x6-x15, v0-v7, v16-v31 (all caller-saved); v8-v15 are
 *       saved/restored around the 64-byte loop per AAPCS64.
 * Assumes: src and the four dest buffers each hold at least len bytes,
 *       and (for the tail path) remain readable/writable up to a
 *       16-byte-aligned window ending at buffer end — TODO confirm
 *       the caller guarantees this, as the tail re-reads the last
 *       16 bytes with overlap.
 *-----------------------------------------------------------------------*/
gf_4vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	/* each table entry is 32 bytes: tbl = gftbls + 32 * vec_i, stride 32 * vec */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	/* load the eight 16-byte nibble LUTs once; they are loop-invariant */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack: the 64-byte loop uses v8-v15, whose
	 * low 64 bits are callee-saved under AAPCS64 */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	sub	x_src_end, x_src_end, #64

.Lloop64:
	/* process 64 source bytes per iteration */
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	/* split each data byte into low nibble (and) and high nibble (ushr);
	 * the ushr overwrites v_data_N, so the and must come first */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1: d1 ^= tbl_lo[lo_nibble] ^ tbl_hi[hi_nibble] per byte */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
	eor	v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
	eor	v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
	eor	v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
	eor	v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
	eor	v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
	eor	v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

	/* store dest1/dest2 before loading dest3/dest4: v_d4_* alias v_d1_* */
	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
	eor	v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
	eor	v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
	eor	v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
	eor	v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
	eor	v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
	eor	v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
	eor	v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
	eor	v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
	eor	v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
	eor	v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
	eor	v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
	eor	v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* one 16-byte chunk per iteration until fewer than 16 bytes remain */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.lessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	/* store d1/d2 before loading d3/d4 (register aliasing, as above) */
	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_d4_0, [x_dest4]

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3]
	str	q_d4_0, [x_dest4]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.lessthan16_init:
	/* x_tmp = 16 - remaining; if remaining == 0 we are done */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16:
	/* handle the final partial chunk by re-processing the last full
	 * 16-byte window and masking off the bytes already written */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp

	/* PC-relative address of const_tbl (PIC-safe; replaces the former
	 * literal-pool "ldr x_const, =const_tbl", which needed an absolute
	 * relocation and broke shared-library builds) */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	/* v_tmp = mask with 0x00 in the first (16 - remaining) lanes,
	 * 0xff in the remaining lanes */
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_d4_0, [x_dest4]

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	str	q_d3_0, [x_dest3]
	str	q_d4_0, [x_dest4]

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret

.size gf_4vect_mad_neon, .-gf_4vect_mad_neon

/* constant mask table is read-only: place it in .rodata, not .data */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff