;
; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;**************Variables Vs Registers*****************************************
;    r0 => src
;    r1 => dst
;    r2 => src_stride
;    r3 => dst_stride
;    r4 => filter_x0
;    r8 => ht
;    r10 => wd
;
; NOTE(review): legacy armasm syntax (EXPORT/AREA/PROC/ENDP directives).
; Prologue pushes r4-r12,r14 (40 bytes) then d8-d15 (64 bytes), so the
; first stack argument sits at [sp, #104]; the loads below at #104/#108/
; #124/#128 fetch filter, x0_q4, wd and ht respectively (per the original
; inline comments). r1/r2 are swapped on entry so that r1 = dst and
; r2 = src_stride for the rest of the routine.

    EXPORT          |vpx_convolve8_avg_horiz_filter_type1_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA  ||.text||, CODE, READONLY, ALIGN=2

|vpx_convolve8_avg_horiz_filter_type1_neon| PROC

    stmfd           sp!, {r4 - r12, r14} ;stack stores the values of
                                         ; the arguments
    vpush           {d8 - d15}           ; stack offset by 64
    mov             r4, r1
    mov             r1, r2
    mov             r2, r4

start_loop_count
    ldr             r4, [sp, #104]       ;loads pi1_coeff
    ldr             r8, [sp, #108]       ;loads x0_q4
    add             r4, r4, r8, lsl #4   ;r4 = filter[x0_q4]
    ldr             r8, [sp, #128]       ;loads ht
    ldr             r10, [sp, #124]      ;loads wd
    vld2.8          {d0, d1}, [r4]       ;coeff = vld1_s8(pi1_coeff)
    mov             r11, #1
    subs            r14, r8, #0          ;checks for ht == 0
    vabs.s8         d2, d0               ;vabs_s8(coeff)
    vdup.8          d24, d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
                                         ; 0)
    sub             r12, r0, #3          ;pu1_src - 3
    vdup.8          d25, d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
                                         ; 1)
    add             r4, r12, r2          ;pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8          d26, d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
                                         ; 2)
    rsb             r9, r10, r2, lsl #1  ;2*src_strd - wd
    vdup.8          d27, d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
                                         ; 3)
    rsb             r8, r10, r3, lsl #1  ;2*dst_strd - wd
    vdup.8          d28, d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
                                         ; 4)
    vdup.8          d29, d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
                                         ; 5)
    vdup.8          d30, d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
                                         ; 6)
    vdup.8          d31, d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
                                         ; 7)
    mov             r7, r1
    cmp             r10, #4
    ble             outer_loop_4

    cmp             r10, #24
    moveq           r10, #16
    addeq           r8, #8
    addeq           r9, #8
    cmp             r10, #16
    bge             outer_loop_16

    cmp             r10, #12
    addeq           r8, #4
    addeq           r9, #4
    b               outer_loop_8

outer_loop8_residual
    sub             r12, r0, #3          ;pu1_src - 3
    mov             r1, r7
    mov             r14, #32
    add             r1, #16
    add             r12, #16
    mov             r10, #8
    add             r8, #8
    add             r9, #8

outer_loop_8
    add             r6, r1, r3           ;pu1_dst + dst_strd
    add             r4, r12, r2          ;pu1_src + src_strd
    subs            r5, r10, #0          ;checks wd
    ble             end_inner_loop_8

inner_loop_8
    mov             r7, #0xc000
    vld1.u32        {d0}, [r12], r11     ;vector load pu1_src
    vdup.16         q4, r7
    vld1.u32        {d1}, [r12], r11
    vdup.16         q5, r7
    vld1.u32        {d2}, [r12], r11
    vld1.u32        {d3}, [r12], r11
    mov             r7, #0x4000
    vld1.u32        {d4}, [r12], r11
    vmlsl.u8        q4, d1, d25          ;mul_res = vmlal_u8(src[0_1],
                                         ; coeffabs_1);
    vld1.u32        {d5}, [r12], r11
    vmlal.u8        q4, d3, d27          ;mul_res = vmull_u8(src[0_3],
                                         ; coeffabs_3);
    vld1.u32        {d6}, [r12], r11
    vmlsl.u8        q4, d0, d24          ;mul_res = vmlsl_u8(src[0_0],
                                         ; coeffabs_0);
    vld1.u32        {d7}, [r12], r11
    vmlal.u8        q4, d2, d26          ;mul_res = vmlsl_u8(src[0_2],
                                         ; coeffabs_2);
    vld1.u32        {d12}, [r4], r11     ;vector load pu1_src + src_strd
    vmlal.u8        q4, d4, d28          ;mul_res = vmlal_u8(src[0_4],
                                         ; coeffabs_4);
    vld1.u32        {d13}, [r4], r11
    vmlal.u8        q4, d5, d29          ;mul_res = vmlsl_u8(src[0_5],
                                         ; coeffabs_5);
    vld1.u32        {d14}, [r4], r11
    vmlsl.u8        q4, d6, d30          ;mul_res = vmlal_u8(src[0_6],
                                         ; coeffabs_6);
    vld1.u32        {d15}, [r4], r11
    vmlsl.u8        q4, d7, d31          ;mul_res = vmlsl_u8(src[0_7],
                                         ; coeffabs_7);
    vld1.u32        {d16}, [r4], r11     ;vector load pu1_src + src_strd
    vdup.16         q11, r7
    vmlal.u8        q5, d15, d27         ;mul_res = vmull_u8(src[0_3],
                                         ; coeffabs_3);
    vld1.u32        {d17}, [r4], r11
    vmlal.u8        q5, d14, d26         ;mul_res = vmlsl_u8(src[0_2],
                                         ; coeffabs_2);
    vhadd.s16       q4, q4, q11
    vld1.u32        {d18}, [r4], r11
    vmlal.u8        q5, d16, d28         ;mul_res = vmlal_u8(src[0_4],
                                         ; coeffabs_4);
    vld1.u32        {d19}, [r4], r11     ;vector load pu1_src + src_strd
    vmlal.u8        q5, d17, d29         ;mul_res = vmlsl_u8(src[0_5],
                                         ; coeffabs_5);
    vld1.u8         {d6}, [r1]
    vqrshrun.s16    d20, q4, #6          ;right shift and saturating narrow
                                         ; result 1
    vmlsl.u8        q5, d18, d30         ;mul_res = vmlal_u8(src[0_6],
                                         ; coeffabs_6);
    vmlsl.u8        q5, d19, d31         ;mul_res = vmlsl_u8(src[0_7],
                                         ; coeffabs_7);
    vld1.u8         {d7}, [r6]
    vrhadd.u8       d20, d20, d6
    vmlsl.u8        q5, d12, d24         ;mul_res = vmlsl_u8(src[0_0],
                                         ; coeffabs_0);
    vmlsl.u8        q5, d13, d25         ;mul_res = vmlal_u8(src[0_1],
                                         ; coeffabs_1);
    vst1.8          {d20}, [r1]!         ;store the result pu1_dst
    vhadd.s16       q5, q5, q11
    subs            r5, r5, #8           ;decrement the wd loop
    vqrshrun.s16    d8, q5, #6           ;right shift and saturating narrow
                                         ; result 2
    vrhadd.u8       d8, d8, d7
    vst1.8          {d8}, [r6]!          ;store the result pu1_dst
    cmp             r5, #4
    bgt             inner_loop_8

end_inner_loop_8
    subs            r14, r14, #2         ;decrement the ht loop
    add             r12, r12, r9         ;increment the src pointer by
                                         ; 2*src_strd-wd
    add             r1, r1, r8           ;increment the dst pointer by
                                         ; 2*dst_strd-wd
    bgt             outer_loop_8

    ldr             r10, [sp, #120]      ;loads wd
    cmp             r10, #12
    beq             outer_loop4_residual

end_loops
    b               end_func

outer_loop_16
    str             r0, [sp, #-4]!
    str             r7, [sp, #-4]!
    add             r6, r1, r3           ;pu1_dst + dst_strd
    add             r4, r12, r2          ;pu1_src + src_strd
    and             r0, r12, #31
    mov             r7, #0xc000
    sub             r5, r10, #0          ;checks wd
    pld             [r4, r2, lsl #1]
    pld             [r12, r2, lsl #1]
    vld1.u32        {q0}, [r12], r11     ;vector load pu1_src
    vdup.16         q4, r7
    vld1.u32        {q1}, [r12], r11
    vld1.u32        {q2}, [r12], r11
    vld1.u32        {q3}, [r12], r11
    vmlsl.u8        q4, d0, d24          ;mul_res = vmlsl_u8(src[0_0],
                                         ; coeffabs_0);
    vld1.u32        {q6}, [r12], r11
    vmlsl.u8        q4, d2, d25          ;mul_res = vmlal_u8(src[0_1],
                                         ; coeffabs_1);
    vld1.u32        {q7}, [r12], r11
    vmlal.u8        q4, d4, d26          ;mul_res = vmlsl_u8(src[0_2],
                                         ; coeffabs_2);
    vld1.u32        {q8}, [r12], r11
    vmlal.u8        q4, d6, d27          ;mul_res = vmull_u8(src[0_3],
                                         ; coeffabs_3);
    vld1.u32        {q9}, [r12], r11
    vmlal.u8        q4, d12, d28         ;mul_res = vmlal_u8(src[0_4],
                                         ; coeffabs_4);
    vmlal.u8        q4, d14, d29         ;mul_res = vmlsl_u8(src[0_5],
                                         ; coeffabs_5);
    vdup.16         q10, r7
    vmlsl.u8        q4, d16, d30         ;mul_res = vmlal_u8(src[0_6],
                                         ; coeffabs_6);
    vmlsl.u8        q4, d18, d31         ;mul_res = vmlsl_u8(src[0_7],
                                         ; coeffabs_7);

inner_loop_16
    vmlsl.u8        q10, d1, d24
    vdup.16         q5, r7
    vmlsl.u8        q10, d3, d25
    mov             r7, #0x4000
    vdup.16         q11, r7
    vmlal.u8        q10, d5, d26
    vld1.u32        {q0}, [r4], r11      ;vector load pu1_src
    vhadd.s16       q4, q4, q11
    vld1.u32        {q1}, [r4], r11
    vmlal.u8        q10, d7, d27
    add             r12, #8
    subs            r5, r5, #16
    vmlal.u8        q10, d13, d28
    vld1.u32        {q2}, [r4], r11
    vmlal.u8        q10, d15, d29
    vld1.u32        {q3}, [r4], r11
    vqrshrun.s16    d8, q4, #6           ;right shift and saturating narrow
                                         ; result 1
    vmlsl.u8        q10, d17, d30
    vld1.u32        {q6}, [r4], r11
    vmlsl.u8        q10, d19, d31
    vld1.u32        {q7}, [r4], r11
    add             r7, r1, #8
    vmlsl.u8        q5, d0, d24          ;mul_res = vmlsl_u8(src[0_0],
                                         ; coeffabs_0);
    vmlsl.u8        q5, d2, d25          ;mul_res = vmlal_u8(src[0_1],
                                         ; coeffabs_1);
    vld1.u32        {q8}, [r4], r11
    vhadd.s16       q10, q10, q11
    vld1.u32        {q9}, [r4], r11
    vld1.u8         {d0}, [r1]
    vmlal.u8        q5, d4, d26          ;mul_res = vmlsl_u8(src[0_2],
                                         ; coeffabs_2);
    vld1.u8         {d2}, [r7]
    vmlal.u8        q5, d6, d27          ;mul_res = vmull_u8(src[0_3],
                                         ; coeffabs_3);
    add             r4, #8
    mov             r7, #0xc000
    vmlal.u8        q5, d12, d28         ;mul_res = vmlal_u8(src[0_4],
                                         ; coeffabs_4);
    vmlal.u8        q5, d14, d29         ;mul_res = vmlsl_u8(src[0_5],
                                         ; coeffabs_5);
    vqrshrun.s16    d9, q10, #6
    vdup.16         q11, r7
    vmlsl.u8        q5, d16, d30         ;mul_res = vmlal_u8(src[0_6],
                                         ; coeffabs_6);
    vmlsl.u8        q5, d18, d31         ;mul_res = vmlsl_u8(src[0_7],
                                         ; coeffabs_7);
    mov             r7, #0x4000
    vrhadd.u8       d8, d8, d0
    vrhadd.u8       d9, d9, d2
    vmlsl.u8        q11, d1, d24
    vmlsl.u8        q11, d3, d25
    vdup.16         q10, r7
    vmlal.u8        q11, d5, d26
    pld             [r12, r2, lsl #2]
    pld             [r4, r2, lsl #2]
    addeq           r12, r12, r9         ;increment the src pointer by
                                         ; 2*src_strd-wd
    addeq           r4, r12, r2          ;pu1_src + src_strd
    vmlal.u8        q11, d7, d27
    vmlal.u8        q11, d13, d28
    vst1.8          {q4}, [r1]!          ;store the result pu1_dst
    subeq           r14, r14, #2
    vhadd.s16       q5, q5, q10
    vmlal.u8        q11, d15, d29
    addeq           r1, r1, r8
    vmlsl.u8        q11, d17, d30
    cmp             r14, #0
    vmlsl.u8        q11, d19, d31
    vqrshrun.s16    d10, q5, #6          ;right shift and saturating narrow
                                         ; result 2
    beq             epilog_16

    vld1.u32        {q0}, [r12], r11     ;vector load pu1_src
    mov             r7, #0xc000
    cmp             r5, #0
    vld1.u32        {q1}, [r12], r11
    vhadd.s16       q11, q11, q10
    vld1.u32        {q2}, [r12], r11
    vdup.16         q4, r7
    vmlsl.u8        q4, d0, d24          ;mul_res = vmlsl_u8(src[0_0],
                                         ; coeffabs_0);
    vdup.16         q10, r7
    vld1.u32        {q3}, [r12], r11
    add             r7, r6, #8
    moveq           r5, r10
    vld1.u8         {d0}, [r6]
    vmlsl.u8        q4, d2, d25          ;mul_res = vmlal_u8(src[0_1],
                                         ; coeffabs_1);
    vld1.u8         {d2}, [r7]
    vqrshrun.s16    d11, q11, #6
    vmlal.u8        q4, d4, d26          ;mul_res = vmlsl_u8(src[0_2],
                                         ; coeffabs_2);
    vld1.u32        {q6}, [r12], r11
    vrhadd.u8       d10, d10, d0
    vld1.u32        {q7}, [r12], r11
    vrhadd.u8       d11, d11, d2
    vld1.u32        {q8}, [r12], r11
    vmlal.u8        q4, d6, d27          ;mul_res = vmull_u8(src[0_3],
                                         ; coeffabs_3);
    vld1.u32        {q9}, [r12], r11
    vmlal.u8        q4, d12, d28         ;mul_res = vmlal_u8(src[0_4],
                                         ; coeffabs_4);
    vmlal.u8        q4, d14, d29         ;mul_res = vmlsl_u8(src[0_5],
                                         ; coeffabs_5);
    mov             r7, #0xc000
    vmlsl.u8        q4, d16, d30         ;mul_res = vmlal_u8(src[0_6],
                                         ; coeffabs_6);
    vst1.8          {q5}, [r6]!          ;store the result pu1_dst
    vmlsl.u8        q4, d18, d31         ;mul_res = vmlsl_u8(src[0_7],
                                         ; coeffabs_7);
    addeq           r6, r1, r3           ;pu1_dst + dst_strd
    b               inner_loop_16

epilog_16
    mov             r7, #0x4000
    ldr             r0, [sp], #4
    ldr             r10, [sp, #120]
    vdup.16         q10, r7
    vhadd.s16       q11, q11, q10
    vqrshrun.s16    d11, q11, #6
    add             r7, r6, #8
    vld1.u8         {d20}, [r6]
    vld1.u8         {d21}, [r7]
    vrhadd.u8       d10, d10, d20
    vrhadd.u8       d11, d11, d21
    vst1.8          {q5}, [r6]!          ;store the result pu1_dst
    ldr             r7, [sp], #4
    cmp             r10, #24
    beq             outer_loop8_residual

end_loops1
    b               end_func

outer_loop4_residual
    sub             r12, r0, #3          ;pu1_src - 3
    mov             r1, r7
    add             r1, #8
    mov             r10, #4
    add             r12, #8
    mov             r14, #16
    add             r8, #4
    add             r9, #4

outer_loop_4
    add             r6, r1, r3           ;pu1_dst + dst_strd
    add             r4, r12, r2          ;pu1_src + src_strd
    subs            r5, r10, #0          ;checks wd
    ble             end_inner_loop_4

inner_loop_4
    vld1.u32        {d0}, [r12], r11     ;vector load pu1_src
    vld1.u32        {d1}, [r12], r11
    vld1.u32        {d2}, [r12], r11
    vld1.u32        {d3}, [r12], r11
    vld1.u32        {d4}, [r12], r11
    vld1.u32        {d5}, [r12], r11
    vld1.u32        {d6}, [r12], r11
    vld1.u32        {d7}, [r12], r11
    sub             r12, r12, #4
    vld1.u32        {d12}, [r4], r11     ;vector load pu1_src + src_strd
    vld1.u32        {d13}, [r4], r11
    vzip.32         d0, d12              ;vector zip the i iteration and ii
                                         ; interation in single register
    vld1.u32        {d14}, [r4], r11
    vzip.32         d1, d13
    vld1.u32        {d15}, [r4], r11
    vzip.32         d2, d14
    vld1.u32        {d16}, [r4], r11
    vzip.32         d3, d15
    vld1.u32        {d17}, [r4], r11
    vzip.32         d4, d16
    vld1.u32        {d18}, [r4], r11
    vzip.32         d5, d17
    vld1.u32        {d19}, [r4], r11
    mov             r7, #0xc000
    vdup.16         q4, r7
    sub             r4, r4, #4
    vzip.32         d6, d18
    vzip.32         d7, d19
    vmlsl.u8        q4, d1, d25          ;arithmetic operations for ii
                                         ; iteration in the same time
    vmlsl.u8        q4, d0, d24
    vmlal.u8        q4, d2, d26
    vmlal.u8        q4, d3, d27
    vmlal.u8        q4, d4, d28
    vmlal.u8        q4, d5, d29
    vmlsl.u8        q4, d6, d30
    vmlsl.u8        q4, d7, d31
    mov             r7, #0x4000
    vdup.16         q10, r7
    vhadd.s16       q4, q4, q10
    vqrshrun.s16    d8, q4, #6
    vld1.u32        {d10[0]}, [r1]
    vld1.u32        {d10[1]}, [r6]
    vrhadd.u8       d8, d8, d10
    vst1.32         {d8[0]},[r1]!        ;store the i iteration result which
                                         ; is in upper part of the register
    vst1.32         {d8[1]},[r6]!        ;store the ii iteration result which
                                         ; is in lower part of the register
    subs            r5, r5, #4           ;decrement the wd by 4
    bgt             inner_loop_4

end_inner_loop_4
    subs            r14, r14, #2         ;decrement the ht by 4
    add             r12, r12, r9         ;increment the input pointer
                                         ; 2*src_strd-wd
    add             r1, r1, r8           ;increment the output pointer
                                         ; 2*dst_strd-wd
    bgt             outer_loop_4

end_func
    vpop            {d8 - d15}
    ldmfd           sp!, {r4 - r12, r15} ;reload the registers from sp

    ENDP

    END