;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Register roles shared by both kernels in this file
;   rsi  = current source row pointer
;   rdi  = current reference row pointer
;   rax  = source stride in bytes    (stride argument doubled)
;   rdx  = reference stride in bytes (stride argument doubled)
;   rbx  = scratch address used only for prefetching
;   rcx  = rows still to process
;   xmm0 = all zeros (compare / unpack operand)
;   xmm5 = per-iteration sum of differences, 8 x 16-bit lanes
;   xmm6 = running sum of squared differences, 4 x 32-bit lanes
;   xmm7 = running sum of differences, 4 x 32-bit lanes
;   xmm1, xmm2, xmm3, xmm4 = temporaries
;-----------------------------------------------------------------------

; Prefetch the first 32 bytes of two consecutive rows.
;   %1 = row base register, %2 = stride register (bytes)
%macro HBD_VAR_PREFETCH_32B 2
        prefetcht0      [%1]
        prefetcht0      [%1+16]
        prefetcht0      [%1+%2]
        prefetcht0      [%1+%2+16]
%endmacro

; Prefetch the start of two consecutive rows.
;   %1 = row base register, %2 = stride register (bytes)
%macro HBD_VAR_PREFETCH_ROWS 2
        prefetcht0      [%1]
        prefetcht0      [%1+%2]
%endmacro

; Fold one 8-word group into the accumulators while loading the next one.
;   %1 = register holding 8 source words (xmm2 holds the matching ref words)
;   %2 = register that receives the next 8 source words
;   %3 = memory operand of the next source words
;   %4 = memory operand of the next reference words (loaded into xmm2)
; The loads are interleaved with the arithmetic on purpose; the order of
; the instructions below must not be changed.
%macro HBD_VAR_DIFF_STEP 4
        psubw       %1,             xmm2             ; diff = src - ref (words)
        movdqu      %2,             XMMWORD PTR %3
        paddw       xmm5,           %1               ; word sum += diff
        pmaddwd     %1,             %1               ; pairwise diff*diff -> dwords
        movdqu      xmm2,           XMMWORD PTR %4
        paddd       xmm6,           %1               ; sse += squared diffs
%endmacro

; Same as HBD_VAR_DIFF_STEP for the final group of an iteration: nothing
; further is loaded.
;   %1 = register holding 8 source words (xmm2 holds the matching ref words)
%macro HBD_VAR_DIFF_LAST 1
        psubw       %1,             xmm2
        paddw       xmm5,           %1
        pmaddwd     %1,             %1
        paddd       xmm6,           %1
%endmacro

; Sign-extend the eight 16-bit sums in xmm5 to 32 bits and add them to xmm7.
; The mask in xmm1 is built as NOT((x > 0) OR (x == 0)), i.e. all ones in
; every lane whose sum is negative; it supplies the upper half of each dword.
%macro HBD_VAR_WIDEN_SUM 0
        movdqa      xmm1,           xmm5
        movdqa      xmm2,           xmm5
        pcmpgtw     xmm1,           xmm0             ; lane > 0
        pcmpeqw     xmm2,           xmm0             ; lane == 0
        por         xmm1,           xmm2             ; lane >= 0
        pcmpeqw     xmm1,           xmm0             ; invert: lane < 0
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1             ; low 4 words  -> dwords
        punpckhwd   xmm2,           xmm1             ; high 4 words -> dwords
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2
%endmacro

; Horizontally add the four dword lanes of xmm6 (sse) and of xmm7 (sum).
; On exit the totals sit in the low dword of xmm6 and xmm7 respectively.
; Uses xmm4 and xmm5 as temporaries; expects xmm0 == 0.
%macro HBD_VAR_REDUCE 0
        movdqa      xmm4,           xmm6
        punpckldq   xmm6,           xmm0             ; [s0, 0, s1, 0]

        punpckhdq   xmm4,           xmm0             ; [s2, 0, s3, 0]
        movdqa      xmm5,           xmm7

        paddd       xmm6,           xmm4             ; [s0+s2, 0, s1+s3, 0]
        punpckldq   xmm7,           xmm0

        punpckhdq   xmm5,           xmm0
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7

        psrldq      xmm4,           8                ; bring upper pair down
        psrldq      xmm5,           8

        paddd       xmm6,           xmm4             ; low dword = total
        paddd       xmm7,           xmm5
%endmacro

SECTION .text

;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int                src_stride,
;    unsigned char   *  ref_ptr,
;    int                ref_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Accumulates, over 16 rows of 16 samples each, the sum of (src - ref)
; and the sum of (src - ref)^2, and stores them through Sum and SSE.
;
; Notes:
;   - Samples are processed as 16-bit words (psubw / pmaddwd) and both
;     strides are doubled to obtain byte offsets, although the prototype
;     above spells the pointers as unsigned char *.
;   - Rows are read with unaligned loads (movdqu).
;   - Two rows are handled per loop iteration.
;   - rax is not loaded with a dedicated return value; after the stores
;     it still holds the Sum pointer (unless the epilog macros change it).
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,            arg(0)           ; src_ptr
        mov         rdi,            arg(2)           ; ref_ptr

        movsxd      rax,            DWORD PTR arg(1) ; src_stride
        movsxd      rdx,            DWORD PTR arg(3) ; ref_stride
        add         rax,            rax              ; source stride in bytes
        add         rdx,            rdx              ; recon stride in bytes

        ; Warm the cache with the first four rows of each block.
        HBD_VAR_PREFETCH_32B rsi, rax
        lea             rbx,    [rsi+rax*2]
        HBD_VAR_PREFETCH_32B rbx, rax

        HBD_VAR_PREFETCH_32B rdi, rdx
        lea             rbx,    [rdi+rdx*2]
        HBD_VAR_PREFETCH_32B rbx, rdx

        pxor        xmm0,           xmm0             ; zero, used for compare/unpack
        pxor        xmm7,           xmm7             ; sum of differences

        pxor        xmm6,           xmm6             ; sum of squared differences
        mov         rcx,            16               ; rows remaining

.row_pair_16:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        ; Prefetch the two rows that the next iteration will read.
        lea             rbx,    [rsi+rax*2]
        HBD_VAR_PREFETCH_32B rbx, rax
        lea             rbx,    [rdi+rdx*2]
        HBD_VAR_PREFETCH_32B rbx, rdx

        pxor        xmm5,           xmm5             ; fresh word sums for this pair

        ; row 0, words 0-7  (next: row 0, words 8-15)
        HBD_VAR_DIFF_STEP xmm1, xmm3, [rsi+16], [rdi+16]

        ; row 0, words 8-15 (next: row 1, words 0-7)
        HBD_VAR_DIFF_STEP xmm3, xmm1, [rsi+rax], [rdi+rdx]

        ; row 1, words 0-7  (next: row 1, words 8-15)
        HBD_VAR_DIFF_STEP xmm1, xmm3, [rsi+rax+16], [rdi+rdx+16]

        ; row 1, words 8-15
        HBD_VAR_DIFF_LAST xmm3

        HBD_VAR_WIDEN_SUM

        lea         rsi,            [rsi + 2*rax]    ; advance two rows
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            2
        jnz         .row_pair_16

        HBD_VAR_REDUCE

        mov         rdi,            arg(4)           ; SSE
        mov         rax,            arg(5)           ; Sum

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int                src_stride,
;    unsigned char   *  ref_ptr,
;    int                ref_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Accumulates, over 8 rows of 8 samples each, the sum of (src - ref)
; and the sum of (src - ref)^2, and stores them through Sum and SSE.
;
; Notes:
;   - Samples are processed as 16-bit words (psubw / pmaddwd) and both
;     strides are doubled to obtain byte offsets, although the prototype
;     above spells the pointers as unsigned char *.
;   - Rows are read with unaligned loads (movdqu).
;   - Four rows are handled per loop iteration; rsi/rdi are advanced by
;     two rows in the middle of the iteration and again at its end.
;   - rax is not loaded with a dedicated return value; after the stores
;     it still holds the Sum pointer (unless the epilog macros change it).
;-----------------------------------------------------------------------
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,            arg(0)           ; src_ptr
        mov         rdi,            arg(2)           ; ref_ptr

        movsxd      rax,            DWORD PTR arg(1) ; src_stride
        movsxd      rdx,            DWORD PTR arg(3) ; ref_stride
        add         rax,            rax              ; source stride in bytes
        add         rdx,            rdx              ; recon stride in bytes

        ; Warm the cache with the first four rows of each block.
        HBD_VAR_PREFETCH_ROWS rsi, rax
        lea             rbx,    [rsi+rax*2]
        HBD_VAR_PREFETCH_ROWS rbx, rax

        HBD_VAR_PREFETCH_ROWS rdi, rdx
        lea             rbx,    [rdi+rdx*2]
        HBD_VAR_PREFETCH_ROWS rbx, rdx

        pxor        xmm0,           xmm0             ; zero, used for compare/unpack
        pxor        xmm7,           xmm7             ; sum of differences

        pxor        xmm6,           xmm6             ; sum of squared differences
        mov         rcx,            8                ; rows remaining

.row_quad_8:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        ; Prefetch the four rows that the next iteration will read.
        lea             rbx,    [rsi+rax*4]
        HBD_VAR_PREFETCH_ROWS rbx, rax
        lea             rbx,    [rbx+rax*2]
        HBD_VAR_PREFETCH_ROWS rbx, rax
        lea             rbx,    [rdi+rdx*4]
        HBD_VAR_PREFETCH_ROWS rbx, rdx
        lea             rbx,    [rbx+rdx*2]
        HBD_VAR_PREFETCH_ROWS rbx, rdx

        pxor        xmm5,           xmm5             ; fresh word sums for this quad

        ; row 0 (next: row 1)
        HBD_VAR_DIFF_STEP xmm1, xmm3, [rsi+rax], [rdi+rdx]

        lea         rsi,            [rsi + 2*rax]    ; step to rows 2 and 3
        lea         rdi,            [rdi + 2*rdx]

        ; row 1 (next: row 2, now at [rsi] / [rdi])
        HBD_VAR_DIFF_STEP xmm3, xmm1, [rsi], [rdi]

        ; row 2 (next: row 3)
        HBD_VAR_DIFF_STEP xmm1, xmm3, [rsi+rax], [rdi+rdx]

        ; row 3
        HBD_VAR_DIFF_LAST xmm3

        HBD_VAR_WIDEN_SUM

        lea         rsi,            [rsi + 2*rax]    ; step past rows 2 and 3
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            4
        jnz         .row_quad_8

        HBD_VAR_REDUCE

        mov         rdi,            arg(4)           ; SSE
        mov         rax,            arg(5)           ; Sum

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret