/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

/*
 * gf_3vect_mad_neon
 *
 * Syntax: GNU as, AArch64 (ELF).  ABI: AAPCS64.
 *
 * For every byte i in [0, len) and each of the three destination
 * buffers k = 1..3 this routine performs
 *
 *     dest_k[i] ^= gft_k_lo[src[i] & 0x0f] ^ gft_k_hi[src[i] >> 4]
 *
 * where gft_k_lo / gft_k_hi are the two 16-byte halves of a 32-byte
 * lookup table (presumably GF(2^8) multiplication tables, judging by
 * the names -- confirm against the table generator).
 *
 * In:    x0 = len      number of bytes to process; must be >= 16
 *        x1 = vec      table stride, in units of 32 bytes
 *        x2 = vec_i    index of the first table, in units of 32 bytes
 *        x3 = tbl      base address of the lookup tables
 *                          table 1 = tbl + 32 * vec_i
 *                          table 2 = table 1 + 32 * vec
 *                          table 3 = table 2 + 32 * vec
 *        x4 = src      source buffer
 *        x5 = dest     pointer to an array of three destination pointers
 * Out:   w0 = 0 on success, 1 if len < 16 (nothing is written then)
 * Clobb: x0-x2, x4-x8, x10-x14, v0-v7, v16, v17, v20-v31, flags
 *        (d8-d15 are used by the 64-byte loop but saved and restored)
 * Stack: 64 bytes, only while the 64-byte loop runs
 *
 * NOTE(review): len, vec and vec_i are consumed as full 64-bit
 * registers.  If the C prototype declares them as 32-bit ints, AAPCS64
 * leaves the upper register halves unspecified -- confirm the
 * prototype / callers.
 */

	.text

	.global	gf_3vect_mad_neon
	.type	gf_3vect_mad_neon, %function


/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_vec_i		.req	x2
x_tbl		.req	x3
x_src		.req	x4
x_dest		.req	x5

/* returns */
w_ret		.req	w0

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x_dest	/* reuses x5 once the pointer array has been read */
x_tmp		.req	x10
x_tbl1		.req	x11
x_tbl2		.req	x12
x_tbl3		.req	x13
x_const		.req	x14

/* vectors */
v_mask0f	.req	v0
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* the high nibbles overwrite the raw data registers in place */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/*
 * Registers for the 16-byte paths.  They overlap v_d1_1..v_d1_3, which
 * are only live inside the 64-byte loop.
 */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23

/*
 * acc ^= gft_lo[nib_lo] ^ gft_hi[nib_hi], 16 byte lanes at once.
 * Clobbers v_tmp_lo and v_tmp_hi.
 */
.macro	gf3_mad_acc	acc, gft_lo, gft_hi, nib_lo, nib_hi
	tbl	v_tmp_lo.16b, {\gft_lo\().16b}, \nib_lo\().16b
	tbl	v_tmp_hi.16b, {\gft_hi\().16b}, \nib_hi\().16b
	eor	\acc\().16b, v_tmp_lo.16b, \acc\().16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
.endm

/*
 * Same as gf3_mad_acc on v_data_lo / v_data_hi, but only lanes whose
 * byte in v_tmp is 0xff are updated.  Clobbers v_tmp_lo and v_tmp_hi.
 */
.macro	gf3_mad_acc_masked	acc, gft_lo, gft_hi
	tbl	v_tmp_lo.16b, {\gft_lo\().16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {\gft_hi\().16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
.endm

gf_3vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f
	lsl	x_vec_i, x_vec_i, #5		/* byte offset of table 1 (32 bytes per table) */
	lsl	x_vec, x_vec, #5		/* byte stride between consecutive tables */
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest]
	ldr	x_dest2, [x_dest, #8]
	ldr	x_dest3, [x_dest, #16]		/* must be last: x_dest3 aliases x_dest */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved under AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_src_end = last address at which a full 64-byte block still fits */
	sub	x_src_end, x_src_end, #64

.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	/* split every source byte into its low and high nibble */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* destination 1 */
	gf3_mad_acc	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_0_lo, v_data_0_hi
	gf3_mad_acc	v_d1_1, v_gft1_lo, v_gft1_hi, v_data_1_lo, v_data_1_hi
	gf3_mad_acc	v_d1_2, v_gft1_lo, v_gft1_hi, v_data_2_lo, v_data_2_hi
	gf3_mad_acc	v_d1_3, v_gft1_lo, v_gft1_hi, v_data_3_lo, v_data_3_hi

	/* destination 2 */
	gf3_mad_acc	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_0_lo, v_data_0_hi
	gf3_mad_acc	v_d2_1, v_gft2_lo, v_gft2_hi, v_data_1_lo, v_data_1_hi
	gf3_mad_acc	v_d2_2, v_gft2_lo, v_gft2_hi, v_data_2_lo, v_data_2_hi
	gf3_mad_acc	v_d2_3, v_gft2_lo, v_gft2_hi, v_data_3_lo, v_data_3_hi

	/* destination 3 */
	gf3_mad_acc	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_0_lo, v_data_0_hi
	gf3_mad_acc	v_d3_1, v_gft3_lo, v_gft3_hi, v_data_1_lo, v_data_1_hi
	gf3_mad_acc	v_d3_2, v_gft3_lo, v_gft3_hi, v_data_2_lo, v_data_2_hi
	gf3_mad_acc	v_d3_3, v_gft3_lo, v_gft3_hi, v_data_3_lo, v_data_3_hi

	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #64	/* back to the true end of src */

.Lloop16_init:
	/* x_src_end = last address at which a full 16-byte block still fits */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.Llessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf3_mad_acc	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_lo, v_data_hi
	gf3_mad_acc	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_lo, v_data_hi
	gf3_mad_acc	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_lo, v_data_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.Llessthan16_init:
	/* x_tmp = 16 - (bytes still unprocessed); 16 means nothing is left */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16:
	/*
	 * Tail of 1..15 bytes: step back so that the final 16 bytes of each
	 * buffer are reloaded.  The first x_tmp lanes of that window were
	 * already processed and must be left untouched.
	 */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp

	/*
	 * v_tmp = x_tmp bytes of 0x00 followed by (16 - x_tmp) bytes of 0xff,
	 * taken from an unaligned 16-byte window at const_tbl + 16 - x_tmp.
	 * The table address is formed PC-relatively so that no absolute
	 * relocation is needed in .text.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf3_mad_acc_masked	v_d1_0, v_gft1_lo, v_gft1_hi
	gf3_mad_acc_masked	v_d2_0, v_gft2_lo, v_gft2_hi
	gf3_mad_acc_masked	v_d3_0, v_gft3_lo, v_gft3_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

	.size	gf_3vect_mad_neon, .-gf_3vect_mad_neon

/*
 * Lane mask source for the tail path: 16 bytes of 0x00 followed by
 * 16 bytes of 0xff.  Only ever read, at an unaligned offset.
 */
	.section	.rodata
	.balign	8
const_tbl:
	.dword	0x0000000000000000, 0x0000000000000000
	.dword	0xffffffffffffffff, 0xffffffffffffffff