/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

/*
 * Syntax: GNU as, AArch64 (A64) with Advanced SIMD. ELF object format
 * (the .type / .size directives below are ELF directives).
 */

        .text

        .global gf_vect_dot_prod_neon
        .type   gf_vect_dot_prod_neon, %function

/*----------------------------------------------------------------------
 * gf_vect_dot_prod_neon
 *
 * For every byte position i in [0, len) this routine stores
 *
 *     dest1[i] = XOR over j in [0, vec) of
 *                    lo_j[src[j][i] & 0x0f] ^ hi_j[src[j][i] >> 4]
 *
 * where lo_j / hi_j are the two consecutive 16-byte tables found at
 * tbl + 32 * j.  (Presumably these are GF(2^8) multiplication tables,
 * as the function name suggests; this file only performs the lookups
 * and the XOR accumulation.)
 *
 * In:    x0 = len    number of bytes to process; compared as signed
 *        x1 = vec    number of source buffers
 *        x2 = tbl    32 bytes of lookup table per source buffer
 *        x3 = src    array of vec 8-byte pointers to the source buffers
 *        x4 = dest1  output buffer, len bytes
 * Out:   w0 = 0      all len bytes were written
 *        w0 = 1      len < 16, nothing was read or written
 * Clobb: x0, x1, x5-x9, v0-v2, v24-v27, NZCV; additionally v16-v23,
 *        v28-v31 and the upper 64 bits of v8-v15 when len >= 128.
 * Stack: 64 bytes, only when len >= 128 (d8-d15 are saved/restored).
 *
 * Notes:
 *  - Both per-source loops test their counter after the first
 *    iteration, so src[0] and the first table are read even when vec
 *    is 0.  Callers must pass vec >= 1.
 *  - When len is not a multiple of 16 the final 16-byte chunk is
 *    computed from offset len - 16 and overlaps output that has already
 *    been stored.  That pass re-reads source bytes, so dest1 must not
 *    alias a source buffer in that case.
 *  - NOTE(review): len and vec are consumed as full 64-bit registers.
 *    If the C prototype declares them as 32-bit int, the upper halves
 *    of x0/x1 are not guaranteed by the procedure call standard --
 *    confirm against the prototype.
 *--------------------------------------------------------------------*/

/* arguments */
x_len           .req    x0
x_vec           .req    x1
x_tbl           .req    x2
x_src           .req    x3
x_dest1         .req    x4

/* returns */
w_ret           .req    w0

/* local variables */
x_vec_i         .req    x5      /* byte offset into the src pointer array */
x_ptr           .req    x6      /* current load / store address */
x_pos           .req    x7      /* byte offset of the chunk being processed */
x_tmp           .req    x8
x_tbl1          .req    x9      /* walks tbl, 32 bytes per source */

/* vectors */
v_gft1_lo       .req    v0      /* table indexed by the low nibble */
v_gft1_hi       .req    v1      /* table indexed by the high nibble */
q_gft1_lo       .req    q0
q_gft1_hi       .req    q1
v_mask0f        .req    v2      /* 0x0f in every byte lane */
q_mask0f        .req    q2

/* 128-byte path: eight 16-byte lanes of source data */
v_data_0        .req    v8
v_data_1        .req    v9
v_data_2        .req    v10
v_data_3        .req    v11
v_data_4        .req    v12
v_data_5        .req    v13
v_data_6        .req    v14
v_data_7        .req    v15
q_data_0        .req    q8
q_data_1        .req    q9
q_data_2        .req    q10
q_data_3        .req    q11
q_data_4        .req    q12
q_data_5        .req    q13
q_data_6        .req    q14
q_data_7        .req    q15

/* low-nibble work registers; the high nibbles reuse the data registers */
v_data_0_lo     .req    v16
v_data_1_lo     .req    v17
v_data_2_lo     .req    v18
v_data_3_lo     .req    v19
v_data_4_lo     .req    v20
v_data_5_lo     .req    v21
v_data_6_lo     .req    v22
v_data_7_lo     .req    v23
v_data_0_hi     .req    v_data_0
v_data_1_hi     .req    v_data_1
v_data_2_hi     .req    v_data_2
v_data_3_hi     .req    v_data_3
v_data_4_hi     .req    v_data_4
v_data_5_hi     .req    v_data_5
v_data_6_hi     .req    v_data_6
v_data_7_hi     .req    v_data_7

/* 128-byte path: eight 16-byte accumulators */
v_p0            .req    v24
v_p1            .req    v25
v_p2            .req    v26
v_p3            .req    v27
v_p4            .req    v28
v_p5            .req    v29
v_p6            .req    v30
v_p7            .req    v31
q_p0            .req    q24
q_p1            .req    q25
q_p2            .req    q26
q_p3            .req    q27
q_p4            .req    q28
q_p5            .req    q29
q_p6            .req    q30
q_p7            .req    q31

/* 16-byte path: reuses four of the accumulator registers */
v_p             .req    v_p0
q_p             .req    q_p0
v_data          .req    v_p1
q_data          .req    q_p1
v_data_lo       .req    v_p2
v_data_hi       .req    v_p3


gf_vect_dot_prod_neon:
        /* less than 16 bytes, return_fail (signed compare) */
        cmp     x_len, #16
        blt     .Lreturn_fail

        movi    v_mask0f.16b, #0x0f
        mov     x_pos, #0

        /* x_vec = vec * 8: end offset of the src pointer array in bytes */
        lsl     x_vec, x_vec, #3

.Lloop128_init:
        /* less than 128 bytes, goto Lloop16_init */
        cmp     x_len, #128
        blt     .Lloop16_init

        /* save d8 ~ d15 to stack; v8-v15 hold the source data below */
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]

        /* x_len = len - 128: highest x_pos that still has a full block */
        sub     x_len, x_len, #128

.Lloop128:
        /* clear the eight accumulators for this 128-byte block */
        movi    v_p0.16b, #0
        movi    v_p1.16b, #0
        movi    v_p2.16b, #0
        movi    v_p3.16b, #0
        movi    v_p4.16b, #0
        movi    v_p5.16b, #0
        movi    v_p6.16b, #0
        movi    v_p7.16b, #0

        mov     x_tbl1, x_tbl
        mov     x_vec_i, #0

.Lloop128_vects:
        /* x_ptr = src[j] + x_pos, then step to the next source pointer */
        ldr     x_ptr, [x_src, x_vec_i]
        add     x_vec_i, x_vec_i, #8
        add     x_ptr, x_ptr, x_pos

        /* tables for this source; x_tbl1 advances to the next pair */
        ldp     q_gft1_lo, q_gft1_hi, [x_tbl1], #32

        /* 128 bytes of source data; x_ptr ends 96 bytes past its start */
        ldp     q_data_0, q_data_1, [x_ptr], #32
        ldp     q_data_2, q_data_3, [x_ptr], #32
        ldp     q_data_4, q_data_5, [x_ptr], #32
        ldp     q_data_6, q_data_7, [x_ptr]

        prfm    pldl1keep, [x_tbl1]
        prfm    pldl1strm, [x_ptr]

        /* low nibble of every data byte */
        and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and     v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and     v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
        and     v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
        and     v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
        and     v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
        and     v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

        /* high nibble of every data byte, in place */
        ushr    v_data_0_hi.16b, v_data_0.16b, #4
        ushr    v_data_1_hi.16b, v_data_1.16b, #4
        ushr    v_data_2_hi.16b, v_data_2.16b, #4
        ushr    v_data_3_hi.16b, v_data_3.16b, #4
        ushr    v_data_4_hi.16b, v_data_4.16b, #4
        ushr    v_data_5_hi.16b, v_data_5.16b, #4
        ushr    v_data_6_hi.16b, v_data_6.16b, #4
        ushr    v_data_7_hi.16b, v_data_7.16b, #4

        /* look each nibble up in its 16-entry table */
        tbl     v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl     v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl     v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl     v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl     v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
        tbl     v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
        tbl     v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
        tbl     v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

        tbl     v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl     v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        tbl     v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        tbl     v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        tbl     v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
        tbl     v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
        tbl     v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
        tbl     v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

        /* accumulate: p ^= lo_lookup ^ hi_lookup */
        eor     v_p0.16b, v_data_0_lo.16b, v_p0.16b
        eor     v_p0.16b, v_p0.16b, v_data_0_hi.16b
        eor     v_p1.16b, v_data_1_lo.16b, v_p1.16b
        eor     v_p1.16b, v_p1.16b, v_data_1_hi.16b
        eor     v_p2.16b, v_data_2_lo.16b, v_p2.16b
        eor     v_p2.16b, v_p2.16b, v_data_2_hi.16b
        eor     v_p3.16b, v_data_3_lo.16b, v_p3.16b
        eor     v_p3.16b, v_p3.16b, v_data_3_hi.16b
        eor     v_p4.16b, v_data_4_lo.16b, v_p4.16b
        eor     v_p4.16b, v_p4.16b, v_data_4_hi.16b
        eor     v_p5.16b, v_data_5_lo.16b, v_p5.16b
        eor     v_p5.16b, v_p5.16b, v_data_5_hi.16b
        eor     v_p6.16b, v_data_6_lo.16b, v_p6.16b
        eor     v_p6.16b, v_p6.16b, v_data_6_hi.16b
        eor     v_p7.16b, v_data_7_lo.16b, v_p7.16b
        eor     v_p7.16b, v_p7.16b, v_data_7_hi.16b

        /* next source while x_vec_i < vec * 8 */
        cmp     x_vec_i, x_vec
        blt     .Lloop128_vects

.Lloop128_vects_end:
        /* store the 128 accumulated bytes at dest1 + x_pos */
        add     x_ptr, x_dest1, x_pos
        stp     q_p0, q_p1, [x_ptr], #32
        stp     q_p2, q_p3, [x_ptr], #32
        stp     q_p4, q_p5, [x_ptr], #32
        stp     q_p6, q_p7, [x_ptr]

        /* another full block remains while x_pos <= len - 128 */
        add     x_pos, x_pos, #128
        cmp     x_pos, x_len
        ble     .Lloop128

.Lloop128_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64

        /* x_len = len again; finished if len was a multiple of 128 */
        add     x_len, x_len, #128
        cmp     x_pos, x_len
        beq     .Lreturn_pass

.Lloop16_init:
        /* x_len = len - 16: highest x_pos that still has a full chunk */
        sub     x_len, x_len, #16
        cmp     x_pos, x_len
        bgt     .Llessthan16_init       /* fewer than 16 bytes remain */

.Lloop16:
        movi    v_p.16b, #0
        mov     x_tbl1, x_tbl
        mov     x_vec_i, #0

.Lloop16_vects:
        /* 16 bytes from src[j] + x_pos, then step to the next source */
        ldr     x_ptr, [x_src, x_vec_i]
        ldr     q_data, [x_ptr, x_pos]
        add     x_vec_i, x_vec_i, #8

        ldp     q_gft1_lo, q_gft1_hi, [x_tbl1], #32

        and     v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr    v_data_hi.16b, v_data.16b, #4

        tbl     v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl     v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor     v_p.16b, v_data_lo.16b, v_p.16b
        eor     v_p.16b, v_p.16b, v_data_hi.16b

        cmp     x_vec_i, x_vec
        blt     .Lloop16_vects

.Lloop16_vects_end:
        str     q_p, [x_dest1, x_pos]
        add     x_pos, x_pos, #16
        cmp     x_pos, x_len
        ble     .Lloop16

.Lloop16_end:
        /* x_len is len - 16 here, so a difference of 16 means x_pos == len */
        sub     x_tmp, x_pos, x_len
        cmp     x_tmp, #16
        beq     .Lreturn_pass

.Llessthan16_init:
        /*
         * Redo the last 16 bytes from offset len - 16.  After that pass
         * x_pos == len, so .Lloop16_end exits through .Lreturn_pass.
         */
        mov     x_pos, x_len
        b       .Lloop16

.Lreturn_pass:
        mov     w_ret, #0
        ret

.Lreturn_fail:
        mov     w_ret, #1
        ret

        .size   gf_vect_dot_prod_neon, .-gf_vect_dot_prod_neon