//
//  MNNGemmInt8AddBiasScale_16x4_Unit.S
//  MNN
//
//  Created by MNN on 2019/06/11.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNGemmInt8AddBiasScale_16x4_Unit

//struct QuanPostTreatParameters {
//    const float* scale;
//    const int32_t* bias;
//    int32_t maxValue;
//    int32_t minValue;
//};

//void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
//                                              size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realSize) {

//-----------------------------------------------------------------------------
// Purpose
//   int8 GEMM micro-kernel. For every dst_depth_quad step it computes, for up
//   to 4 source columns, the dot products of 4 weight rows with the column
//   (16 int8 values per src_depth_quad step), adds a per-row int32 bias and
//   then either
//     * post->scale == NULL : converts the int32 sums to float and stores
//                             4 floats per column, or
//     * post->scale != NULL : multiplies by the per-row float scale, rounds to
//                             nearest (ties away, fcvtas), clamps to
//                             [minValue, maxValue], saturates to int8 and
//                             stores 4 bytes per column.
//
// Arguments on entry (AAPCS64)
//   x0: dst*, x1: src*, x2: weight*, x3: src_depth_quad, x4: dst_step,
//   x5: dst_depth_quad, x6: post, x7: realSize
//
// Register roles after the entry sequence
//   x7 : scale pointer (advanced by 16 bytes per dst_depth_quad step when used)
//   x10: bias pointer  (advanced by 16 bytes per dst_depth_quad step)
//   w11: maxValue, w6: minValue
//   x8 : realSize during dispatch, afterwards the saved src base pointer
//   x9 : inner (src_depth_quad) loop counter
//   x4 : total number of bytes dst advances per dst_depth_quad step
//
// Data layout as read by the code
//   weight: 64 bytes per src_depth_quad step = 4 rows x 16 int8, consumed
//           linearly across all dst_depth_quad steps.
//   src   : 64 bytes per src_depth_quad step = 4 columns x 16 int8; the
//           pointer always advances by 64 bytes per step, even when fewer than
//           4 columns are computed, and is rewound for every dst_depth_quad
//           step.
//
// realSize
//   1, 2 and 3 select the reduced-width paths; every other value takes the
//   4-column path.
//
// Preconditions
//   src_depth_quad must be >= 1 (the inner loops are peeled and count down
//   from src_depth_quad - 1).
//   NOTE(review): each int16 lane holds the sum of two int8*int8 products
//   before it is widened, so (-128)*(-128) occurring twice in one lane would
//   wrap; presumably the caller restricts the value range - confirm.
//
// Clobbers
//   x0-x11, v0-v7, v16-v31, flags. v8-v15 are used as scratch but saved and
//   restored in full on the stack (128 bytes).
//-----------------------------------------------------------------------------

//Load from post:
// x7: scale, x10: bias, w11: maxValue, w6: minValue
mov x8, x7
ldr x7, [x6, #0]
ldr x10, [x6, #8]
ldr w11, [x6, #16]
ldr w6, [x6, #20]

// Save v8-v15. The save area is allocated first and sp stays lowered for the
// whole function, so the saved values are never located below sp where an
// asynchronously delivered signal frame could overwrite them.
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add x9, sp, #64                 // x9 is free here; it is re-initialised per loop
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]

// The output loops below are do-while loops; skip them when there is no output.
cbz x5, End

cmp x8, #3
beq L3Dz

cmp x8, #2
beq L2Dz

cmp x8, #1
beq L1Dz

// ---------------------------------------------------------------------------
// 4 source columns
// ---------------------------------------------------------------------------
// The int8 path stores 16 bytes as 8 (post-increment #8) + 8 (post-increment
// x4), so x4 is reduced by 8 to keep the total advance equal to dst_step.
cbz x7, L4LoopDz
sub x4, x4, #8
L4LoopDz:
    mov x8, x1                  // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    ld1 {v6.16b}, [x1], #16
    ld1 {v7.16b}, [x1], #16

    // First half-step (columns 0 and 1) of the first sz step, interleaved
    // with clearing the int32 accumulators v16-v31.
    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    dup v24.4s, wzr
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1             // flags consumed by the beq below
    smlal2 v8.8h, v0.16b, v4.16b
    dup v25.4s, wzr
    smlal2 v9.8h, v1.16b, v4.16b
    dup v26.4s, wzr
    smlal2 v10.8h, v2.16b, v4.16b
    dup v27.4s, wzr
    smlal2 v11.8h, v3.16b, v4.16b
    dup v28.4s, wzr
    smlal2 v12.8h, v0.16b, v5.16b
    dup v29.4s, wzr
    smlal2 v13.8h, v1.16b, v5.16b
    dup v30.4s, wzr
    smlal2 v14.8h, v2.16b, v5.16b
    dup v31.4s, wzr
    smlal2 v15.8h, v3.16b, v5.16b
    beq L4LoopSzEnd

    L4LoopSz:
        // Fold columns 0/1 of the current step into v16-v23 while computing
        // columns 2/3 (v6, v7) and loading the next step's columns 0/1.
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        smull v8.8h, v0.8b, v6.8b
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        smull v9.8h, v1.8b, v6.8b
        sadalp v18.4s, v10.8h
        smull v10.8h, v2.8b, v6.8b
        sadalp v19.4s, v11.8h
        smull v11.8h, v3.8b, v6.8b
        sadalp v20.4s, v12.8h
        smull v12.8h, v0.8b, v7.8b
        sadalp v21.4s, v13.8h
        smull v13.8h, v1.8b, v7.8b
        sadalp v22.4s, v14.8h
        smull v14.8h, v2.8b, v7.8b
        sadalp v23.4s, v15.8h
        smull v15.8h, v3.8b, v7.8b

        smlal2 v8.8h, v0.16b, v6.16b
        smlal2 v9.8h, v1.16b, v6.16b
        smlal2 v10.8h, v2.16b, v6.16b
        smlal2 v11.8h, v3.16b, v6.16b

        ld1 {v6.16b}, [x1], #16

        // Each weight register is reloaded right after its last use.
        smlal2 v12.8h, v0.16b, v7.16b
        ld1 {v0.16b}, [x2], #16
        smlal2 v13.8h, v1.16b, v7.16b
        ld1 {v1.16b}, [x2], #16
        smlal2 v14.8h, v2.16b, v7.16b
        ld1 {v2.16b}, [x2], #16
        smlal2 v15.8h, v3.16b, v7.16b
        ld1 {v3.16b}, [x2], #16

        // Fold columns 2/3 into v24-v31 while starting the next step.
        sadalp v24.4s, v8.8h
        smull v8.8h, v0.8b, v4.8b
        sadalp v25.4s, v9.8h
        ld1 {v7.16b}, [x1], #16
        smull v9.8h, v1.8b, v4.8b
        sadalp v26.4s, v10.8h
        smull v10.8h, v2.8b, v4.8b
        sadalp v27.4s, v11.8h
        smull v11.8h, v3.8b, v4.8b
        sadalp v28.4s, v12.8h
        smull v12.8h, v0.8b, v5.8b
        sadalp v29.4s, v13.8h
        smull v13.8h, v1.8b, v5.8b
        sadalp v30.4s, v14.8h
        smull v14.8h, v2.8b, v5.8b
        sadalp v31.4s, v15.8h
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L4LoopSz

    L4LoopSzEnd:
    // Drain: finish columns 0/1, then compute and fold columns 2/3 of the
    // last sz step.
    sadalp v16.4s, v8.8h
    smull v8.8h, v0.8b, v6.8b
    sadalp v17.4s, v9.8h
    smull v9.8h, v1.8b, v6.8b
    sadalp v18.4s, v10.8h
    smull v10.8h, v2.8b, v6.8b
    sadalp v19.4s, v11.8h
    smull v11.8h, v3.8b, v6.8b
    sadalp v20.4s, v12.8h
    smull v12.8h, v0.8b, v7.8b
    sadalp v21.4s, v13.8h
    smull v13.8h, v1.8b, v7.8b
    sadalp v22.4s, v14.8h
    smull v14.8h, v2.8b, v7.8b
    sadalp v23.4s, v15.8h
    smull v15.8h, v3.8b, v7.8b

    smlal2 v8.8h, v0.16b, v6.16b
    smlal2 v9.8h, v1.16b, v6.16b
    smlal2 v10.8h, v2.16b, v6.16b
    smlal2 v11.8h, v3.16b, v6.16b

    smlal2 v12.8h, v0.16b, v7.16b
    smlal2 v13.8h, v1.16b, v7.16b
    smlal2 v14.8h, v2.16b, v7.16b
    smlal2 v15.8h, v3.16b, v7.16b

    sadalp v24.4s, v8.8h
    sadalp v25.4s, v9.8h
    sadalp v26.4s, v10.8h
    sadalp v27.4s, v11.8h
    sadalp v28.4s, v12.8h
    sadalp v29.4s, v13.8h
    sadalp v30.4s, v14.8h
    sadalp v31.4s, v15.8h

    // Horizontal reduction: after two addp levels v12..v15 each hold the four
    // row sums of one column. v0 = bias for the four rows.
    ld1 {v0.4s}, [x10], #16
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s
    addp v8.4s, v24.4s, v25.4s
    addp v9.4s, v26.4s, v27.4s
    addp v10.4s, v28.4s, v29.4s
    addp v11.4s, v30.4s, v31.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s
    addp v14.4s, v8.4s, v9.4s
    addp v15.4s, v10.4s, v11.4s

    cbnz x7, L4Quan
    // Float output: (sum + bias) converted to float, no scaling.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    add v19.4s, v15.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    scvtf v2.4s, v18.4s
    scvtf v3.4s, v19.4s
    subs x5, x5, #1             // flags survive mov/st1/b until L4LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x4
    b L4LoopCheck

    L4Quan:
    // int8 output: clamp(round((sum + bias) * scale), min, max) -> int8.
    ld1 {v1.4s}, [x7], #16
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    add v19.4s, v15.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s
    scvtf v6.4s, v18.4s
    scvtf v7.4s, v19.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s
    fmul v14.4s, v6.4s, v1.4s
    fmul v15.4s, v7.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s
    fcvtas v10.4s, v14.4s
    fcvtas v11.4s, v15.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s
    smin v10.4s, v30.4s, v10.4s
    smin v11.4s, v30.4s, v11.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s
    smax v10.4s, v31.4s, v10.4s
    smax v11.4s, v31.4s, v11.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v1.4h, v10.4s
    sqxtn2 v1.8h, v11.4s

    sqxtn v2.8b, v0.8h
    sqxtn v3.8b, v1.8h
    st1 {v2.8b}, [x0], #8
    subs x5, x5, #1             // flags survive mov/st1 until L4LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v3.8b}, [x0], x4
L4LoopCheck:
    bne L4LoopDz

b End

// ---------------------------------------------------------------------------
// 3 source columns
// ---------------------------------------------------------------------------
L3Dz:
// The int8 path stores 8 bytes (post-increment #8) + 4 bytes (post-increment
// x4), so x4 is reduced by 8 to keep the total advance equal to dst_step.
cbz x7, L3LoopDz
sub x4, x4, #8
L3LoopDz:
    mov x8, x1                  // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    ld1 {v6.16b}, [x1], #16
    add x1, x1, #16             // skip the unused 4th column

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    dup v24.4s, wzr
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1             // flags consumed by the beq below
    smlal2 v8.8h, v0.16b, v4.16b
    dup v25.4s, wzr
    smlal2 v9.8h, v1.16b, v4.16b
    dup v26.4s, wzr
    smlal2 v10.8h, v2.16b, v4.16b
    dup v27.4s, wzr
    smlal2 v11.8h, v3.16b, v4.16b
    smlal2 v12.8h, v0.16b, v5.16b
    smlal2 v13.8h, v1.16b, v5.16b
    smlal2 v14.8h, v2.16b, v5.16b
    smlal2 v15.8h, v3.16b, v5.16b
    beq L3LoopSzEnd

    L3LoopSz:
        // Fold columns 0/1 into v16-v23 while computing column 2 (v6).
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        smull v8.8h, v0.8b, v6.8b
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        smull v9.8h, v1.8b, v6.8b
        sadalp v18.4s, v10.8h
        smull v10.8h, v2.8b, v6.8b
        sadalp v19.4s, v11.8h
        smull v11.8h, v3.8b, v6.8b
        sadalp v20.4s, v12.8h
        sadalp v21.4s, v13.8h
        sadalp v22.4s, v14.8h
        sadalp v23.4s, v15.8h

        smlal2 v8.8h, v0.16b, v6.16b
        smlal2 v9.8h, v1.16b, v6.16b
        smlal2 v10.8h, v2.16b, v6.16b
        smlal2 v11.8h, v3.16b, v6.16b

        ld1 {v6.16b}, [x1], #16

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #16         // skip the unused 4th column

        // Fold column 2 into v24-v27 while starting the next step.
        sadalp v24.4s, v8.8h
        smull v8.8h, v0.8b, v4.8b
        sadalp v25.4s, v9.8h
        smull v9.8h, v1.8b, v4.8b
        sadalp v26.4s, v10.8h
        smull v10.8h, v2.8b, v4.8b
        sadalp v27.4s, v11.8h
        smull v11.8h, v3.8b, v4.8b
        smull v12.8h, v0.8b, v5.8b
        smull v13.8h, v1.8b, v5.8b
        smull v14.8h, v2.8b, v5.8b
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L3LoopSz

    L3LoopSzEnd:
    // Drain: finish columns 0/1, then compute and fold column 2 of the last
    // sz step.
    sadalp v16.4s, v8.8h
    smull v8.8h, v0.8b, v6.8b
    sadalp v17.4s, v9.8h
    smull v9.8h, v1.8b, v6.8b
    sadalp v18.4s, v10.8h
    smull v10.8h, v2.8b, v6.8b
    sadalp v19.4s, v11.8h
    smull v11.8h, v3.8b, v6.8b
    sadalp v20.4s, v12.8h
    sadalp v21.4s, v13.8h
    sadalp v22.4s, v14.8h
    sadalp v23.4s, v15.8h

    smlal2 v8.8h, v0.16b, v6.16b
    smlal2 v9.8h, v1.16b, v6.16b
    smlal2 v10.8h, v2.16b, v6.16b
    smlal2 v11.8h, v3.16b, v6.16b

    sadalp v24.4s, v8.8h
    sadalp v25.4s, v9.8h
    sadalp v26.4s, v10.8h
    sadalp v27.4s, v11.8h

    // v12..v14 = four row sums of columns 0..2, v0 = bias.
    ld1 {v0.4s}, [x10], #16
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s
    addp v8.4s, v24.4s, v25.4s
    addp v9.4s, v26.4s, v27.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s
    addp v14.4s, v8.4s, v9.4s

    cbnz x7, L3Quan
    // Float output: (sum + bias) converted to float, no scaling.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    scvtf v2.4s, v18.4s
    subs x5, x5, #1             // flags survive mov/st1/b until L3LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v0.4s, v1.4s, v2.4s}, [x0], x4
    b L3LoopCheck

    L3Quan:
    // int8 output: clamp(round((sum + bias) * scale), min, max) -> int8.
    ld1 {v1.4s}, [x7], #16
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s
    scvtf v6.4s, v18.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s
    fmul v14.4s, v6.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s
    fcvtas v10.4s, v14.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s
    smin v10.4s, v30.4s, v10.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s
    smax v10.4s, v31.4s, v10.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v1.4h, v10.4s

    sqxtn v2.8b, v0.8h
    sqxtn v3.8b, v1.8h
    st1 {v2.8b}, [x0], #8       // columns 0 and 1
    subs x5, x5, #1             // flags survive mov/st1 until L3LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v3.s}[0], [x0], x4     // column 2 (4 bytes)
L3LoopCheck:
    bne L3LoopDz

b End

// ---------------------------------------------------------------------------
// 2 source columns
// ---------------------------------------------------------------------------
L2Dz:
L2LoopDz:
    mov x8, x1                  // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    add x1, x1, #32             // skip the two unused columns

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1             // flags consumed by the beq below
    smlal2 v8.8h, v0.16b, v4.16b
    smlal2 v9.8h, v1.16b, v4.16b
    smlal2 v10.8h, v2.16b, v4.16b
    smlal2 v11.8h, v3.16b, v4.16b
    smlal2 v12.8h, v0.16b, v5.16b
    smlal2 v13.8h, v1.16b, v5.16b
    smlal2 v14.8h, v2.16b, v5.16b
    smlal2 v15.8h, v3.16b, v5.16b
    beq L2LoopSzEnd

    L2LoopSz:
        // Fold the previous step into v16-v23 while loading the next one.
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        sadalp v18.4s, v10.8h
        sadalp v19.4s, v11.8h
        sadalp v20.4s, v12.8h
        sadalp v21.4s, v13.8h
        sadalp v22.4s, v14.8h
        sadalp v23.4s, v15.8h

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #32         // skip the two unused columns

        smull v8.8h, v0.8b, v4.8b
        smull v9.8h, v1.8b, v4.8b
        smull v10.8h, v2.8b, v4.8b
        smull v11.8h, v3.8b, v4.8b
        smull v12.8h, v0.8b, v5.8b
        smull v13.8h, v1.8b, v5.8b
        smull v14.8h, v2.8b, v5.8b
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L2LoopSz

    L2LoopSzEnd:
    sadalp v16.4s, v8.8h
    sadalp v17.4s, v9.8h
    sadalp v18.4s, v10.8h
    sadalp v19.4s, v11.8h
    sadalp v20.4s, v12.8h
    sadalp v21.4s, v13.8h
    sadalp v22.4s, v14.8h
    sadalp v23.4s, v15.8h

    // v12, v13 = four row sums of columns 0 and 1, v0 = bias.
    ld1 {v0.4s}, [x10], #16
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s

    cbnz x7, L2Quan
    // Float output: (sum + bias) converted to float, no scaling.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    subs x5, x5, #1             // flags survive mov/st1/b until L2LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v0.4s, v1.4s}, [x0], x4
    b L2LoopCheck

    L2Quan:
    // int8 output: clamp(round((sum + bias) * scale), min, max) -> int8.
    ld1 {v1.4s}, [x7], #16
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s

    sqxtn v2.8b, v0.8h
    st1 {v2.8b}, [x0], x4
    subs x5, x5, #1
    mov x1, x8                  // rewind src for the next dz step
L2LoopCheck:
    bne L2LoopDz

b End

// ---------------------------------------------------------------------------
// 1 source column
// ---------------------------------------------------------------------------
L1Dz:
L1LoopDz:
    mov x8, x1                  // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    add x1, x1, #48             // skip the three unused columns

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    smull v11.8h, v3.8b, v4.8b
    subs x9, x3, #1             // flags consumed by the beq below
    smlal2 v8.8h, v0.16b, v4.16b
    smlal2 v9.8h, v1.16b, v4.16b
    smlal2 v10.8h, v2.16b, v4.16b
    smlal2 v11.8h, v3.16b, v4.16b
    beq L1LoopSzEnd

    L1LoopSz:
        // Only v16-v19 / v8-v11 are live on this path; the final reduction
        // below reads nothing else.
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        sadalp v17.4s, v9.8h
        sadalp v18.4s, v10.8h
        sadalp v19.4s, v11.8h

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #48         // skip the three unused columns

        smull v8.8h, v0.8b, v4.8b
        smull v9.8h, v1.8b, v4.8b
        smull v10.8h, v2.8b, v4.8b
        smull v11.8h, v3.8b, v4.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        subs x9, x9, #1
        bne L1LoopSz

    L1LoopSzEnd:
    sadalp v16.4s, v8.8h
    sadalp v17.4s, v9.8h
    sadalp v18.4s, v10.8h
    sadalp v19.4s, v11.8h

    // v12 = four row sums of column 0, v0 = bias.
    ld1 {v0.4s}, [x10], #16
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s

    addp v12.4s, v4.4s, v5.4s

    cbnz x7, L1Quan
    // Float output: (sum + bias) converted to float, no scaling.
    add v16.4s, v12.4s, v0.4s
    scvtf v0.4s, v16.4s
    subs x5, x5, #1             // flags survive mov/st1/b until L1LoopCheck
    mov x1, x8                  // rewind src for the next dz step
    st1 {v0.4s}, [x0], x4
    b L1LoopCheck

    L1Quan:
    // int8 output: clamp(round((sum + bias) * scale), min, max) -> int8.
    ld1 {v1.4s}, [x7], #16
    add v16.4s, v12.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s

    fmul v12.4s, v4.4s, v1.4s

    fcvtas v8.4s, v12.4s

    smin v8.4s, v30.4s, v8.4s

    smax v8.4s, v31.4s, v8.4s

    sqxtn v0.4h, v8.4s

    sqxtn v2.8b, v0.8h
    st1 {v2.s}[0], [x0], x4
    subs x5, x5, #1
    mov x1, x8                  // rewind src for the next dz step
L1LoopCheck:
    bne L1LoopDz

End:
// Restore v8-v15; the two post-increments release the 128-byte save area and
// return sp to its value on entry.
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret

#endif