1/************************************************************** 2 Copyright (c) 2019 Huawei Technologies Co., Ltd. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Huawei Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

.text

.global gf_3vect_dot_prod_neon
.type gf_3vect_dot_prod_neon, %function

/*
 * gf_3vect_dot_prod_neon
 *
 * Computes three GF(2^8) vector dot products over the same set of source
 * buffers, writing one output buffer per table set (the classic 4-bit-split
 * table-lookup multiply used by erasure-code encoders; presumed C prototype --
 * confirm against the matching header:
 *     gf_3vect_dot_prod(len, vec, *gftbls, **src, **dest)).
 *
 * In:   x0 = len   - byte length of each vector; must be >= 16 or the
 *                    function fails (see .return_fail)
 *       x1 = vec   - number of source vectors
 *       x2 = tbl   - multiply tables: 32 bytes per source vector per
 *                    destination, 3 destinations laid out back to back
 *                    (x_tbl2/x_tbl3 are derived at vec*32 strides below)
 *       x3 = src   - array of 'vec' source-buffer pointers (8 bytes each)
 *       x4 = dest  - array of 3 destination-buffer pointers
 * Out:  w0 = 0 on success, 1 on failure (len < 16)
 * ABI:  AAPCS64. d8-d15 are callee-saved; they are spilled to the stack
 *       only around the 64-byte main loop, which is the only code that
 *       uses v8-v13. The 16-byte loop stays within volatile registers.
 */

/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_dest1 .req x9
x_tbl1 .req x10
x_dest2 .req x11
x_tbl2 .req x12
x_dest3 .req x13
x_tbl3 .req x14

/* vectors */
/* gft*_lo holds the 16-entry table for the low nibble, gft*_hi for the
 * high nibble; one lo/hi pair per destination. */
v_gft1_lo .req v0
v_gft1_hi .req v1
v_gft2_lo .req v2
v_gft2_hi .req v3
v_gft3_lo .req v4
v_gft3_hi .req v5
q_gft1_lo .req q0
q_gft1_hi .req q1
q_gft2_lo .req q2
q_gft2_hi .req q3
q_gft3_lo .req q4
q_gft3_hi .req q5

v_mask0f .req v6                /* 16 x 0x0f, isolates low nibbles */
q_mask0f .req q6
v_tmp1 .req v7

/* v8-v11/v12-v13 are callee-saved; used only inside the 64-byte loop,
 * which saves/restores d8-d15 around itself. */
v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_tmp1_lo .req v12
v_tmp1_hi .req v13

/* accumulators: v_pD_C = partial sum for destination D, 16-byte chunk C */
v_p1_0 .req v20
v_p1_1 .req v21
v_p1_2 .req v22
v_p1_3 .req v23
v_p2_0 .req v24
v_p2_1 .req v25
v_p2_2 .req v26
v_p2_3 .req v27
v_p3_0 .req v28
v_p3_1 .req v29
v_p3_2 .req v30
v_p3_3 .req v31

q_p1_0 .req q20
q_p1_1 .req q21
q_p1_2 .req q22
q_p1_3 .req q23
q_p2_0 .req q24
q_p2_1 .req q25
q_p2_2 .req q26
q_p2_3 .req q27
q_p3_0 .req q28
q_p3_1 .req q29
q_p3_2 .req q30
q_p3_3 .req q31

/* The 16-byte loop only accumulates into v_p1_0/v_p2_0/v_p3_0, so it can
 * safely reuse the idle chunk-1..3 accumulators of dest1 as data scratch. */
v_data .req v_p1_1
q_data .req q_p1_1
v_data_lo .req v_p1_2
v_data_hi .req v_p1_3


gf_3vect_dot_prod_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        mov x_pos, #0
        lsl x_vec, x_vec, #3            /* x_vec = vec * 8: byte stride over the src pointer array */
        ldr x_dest1, [x_dest, #8*0]
        ldr x_dest2, [x_dest, #8*1]
        ldr x_dest3, [x_dest, #8*2]

.Lloop64_init:
        /* less than 64 bytes, goto Lloop16_init */
        cmp x_len, #64
        blt .Lloop16_init

        /* save d8 ~ d15 to stack (callee-saved low halves, AAPCS64) */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        /* pre-bias so the loop condition is x_pos <= len - 64 */
        sub x_len, x_len, #64

/* Main loop: 64 bytes (4 x 16-byte chunks) per destination per iteration. */
.Lloop64:
        movi v_p1_0.16b, #0
        movi v_p1_1.16b, #0
        movi v_p1_2.16b, #0
        movi v_p1_3.16b, #0
        movi v_p2_0.16b, #0
        movi v_p2_1.16b, #0
        movi v_p2_2.16b, #0
        movi v_p2_3.16b, #0
        movi v_p3_0.16b, #0
        movi v_p3_1.16b, #0
        movi v_p3_2.16b, #0
        movi v_p3_3.16b, #0

        /* per-destination table bases; x_vec<<2 == vec * 32 bytes of tables */
        mov x_tbl1, x_tbl
        add x_tbl2, x_tbl1, x_vec, lsl #2
        add x_tbl3, x_tbl2, x_vec, lsl #2
        mov x_vec_i, #0

/* Inner loop: fold one source vector's 64-byte slice into all 12 accumulators. */
.Lloop64_vects:
        ldr x_ptr, [x_src, x_vec_i]
        add x_vec_i, x_vec_i, #8
        add x_ptr, x_ptr, x_pos

        ldr q_data_0, [x_ptr], #16
        ldr q_data_1, [x_ptr], #16

        /* per-source tables advance 32 bytes (lo + hi) each iteration */
        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
        ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
        ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32

        ldr q_data_2, [x_ptr], #16
        ldr q_data_3, [x_ptr], #16
        prfm pldl1strm, [x_ptr]
        prfm pldl1keep, [x_tbl1]
        prfm pldl1keep, [x_tbl2]
        prfm pldl1keep, [x_tbl3]

        /* data_0: split each byte into nibbles, look both up, XOR-accumulate */
        and v_tmp1.16b, v_data_0.16b, v_mask0f.16b      /* low nibbles */
        ushr v_data_0.16b, v_data_0.16b, #4             /* high nibbles */

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
        eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
        eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
        eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
        eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
        eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
        eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

        /* data_1 */
        and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
        ushr v_data_1.16b, v_data_1.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
        eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
        eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
        eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
        eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
        eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
        eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

        /* data_2 */
        and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
        ushr v_data_2.16b, v_data_2.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
        eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
        eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
        eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
        eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
        eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
        eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

        /* data_3 */
        and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
        ushr v_data_3.16b, v_data_3.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
        eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
        eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
        eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
        eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
        eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
        eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

        cmp x_vec_i, x_vec
        blt .Lloop64_vects

.Lloop64_vects_end:
        /* flush the 4 accumulated chunks of each destination */
        add x_ptr, x_dest1, x_pos
        stp q_p1_0, q_p1_1, [x_ptr], #32
        stp q_p1_2, q_p1_3, [x_ptr]

        add x_ptr, x_dest2, x_pos
        stp q_p2_0, q_p2_1, [x_ptr], #32
        stp q_p2_2, q_p2_3, [x_ptr]

        add x_ptr, x_dest3, x_pos
        stp q_p3_0, q_p3_1, [x_ptr], #32
        stp q_p3_2, q_p3_3, [x_ptr]

        add x_pos, x_pos, #64
        cmp x_pos, x_len                /* x_len is biased by -64 here */
        ble .Lloop64

.Lloop64_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64

        add x_len, x_len, #64           /* undo the bias */
        cmp x_pos, x_len
        beq .return_pass                /* len was a multiple of 64: done */

.Lloop16_init:
        /* bias so the loop condition is x_pos <= len - 16 */
        sub x_len, x_len, #16
        cmp x_pos, x_len
        bgt .lessthan16_init            /* < 16 bytes remain: redo overlapping tail */

/* Secondary loop: 16 bytes per destination per iteration. */
.Lloop16:
        movi v_p1_0.16b, #0
        movi v_p2_0.16b, #0
        movi v_p3_0.16b, #0
        mov x_tbl1, x_tbl
        add x_tbl2, x_tbl1, x_vec, lsl #2
        add x_tbl3, x_tbl2, x_vec, lsl #2
        mov x_vec_i, #0

.Lloop16_vects:
        ldr x_ptr, [x_src, x_vec_i]
        add x_vec_i, x_vec_i, #8
        ldr q_data, [x_ptr, x_pos]

        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
        ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
        ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        /* table registers are dead after this iteration, so look up in place */
        tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b

        eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
        eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
        eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
        eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
        eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
        eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b

        cmp x_vec_i, x_vec
        bne .Lloop16_vects

.Lloop16_vects_end:
        str q_p1_0, [x_dest1, x_pos]
        str q_p2_0, [x_dest2, x_pos]
        str q_p3_0, [x_dest3, x_pos]
        add x_pos, x_pos, #16
        cmp x_pos, x_len                /* x_len is biased by -16 here */
        ble .Lloop16

.Lloop16_end:
        /* x_tmp == 16 exactly when the last pass ended at len; otherwise a
         * sub-16-byte remainder is left and we fall into the overlap path */
        sub x_tmp, x_pos, x_len
        cmp x_tmp, #16
        beq .return_pass

.lessthan16_init:
        /* Handle the final partial chunk by reprocessing the last full
         * 16 bytes at offset len-16 (overlaps already-written output;
         * recomputing the same dot product is idempotent). */
        mov x_pos, x_len
        b .Lloop16

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret