/* Copyright (c) Mark Harmstone 2020
 *
 * This file is part of WinBtrfs.
 *
 * WinBtrfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public Licence as published by
 * the Free Software Foundation, either version 3 of the Licence, or
 * (at your option) any later version.
 *
 * WinBtrfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public Licence for more details.
 *
 * You should have received a copy of the GNU Lesser General Public Licence
 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */

#include <asm.inc>

#ifdef __x86_64__

.code64

/* void do_xor_sse2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *
 * XORs len bytes of buf2 into buf1 (buf1[i] ^= buf2[i]); buf2 is only read.
 * The 16-byte SSE2 loop is used only when both pointers are 16-byte aligned.
 * Whatever remains is handled 8 bytes at a time and then 1 byte at a time.
 * Arguments are taken from rcx, rdx and r8d; nothing is returned. */
PUBLIC do_xor_sse2
do_xor_sse2:
    /* rcx = buf1
     * rdx = buf2
     * r8d = len (unsigned, hence jb rather than jl in the length checks)
     * rax = tmp1
     * r9 = tmp2
     * xmm0 = tmp3
     * xmm1 = tmp4 */

    /* movdqa needs 16-byte-aligned addresses: skip the SSE2 loop if either
     * buffer is misaligned */
    test ecx, 15
    jnz do_xor_sse2_qwords

    test edx, 15
    jnz do_xor_sse2_qwords

do_xor_sse2_loop:
    /* 16 bytes per iteration while at least 16 bytes remain */
    cmp r8d, 16
    jb do_xor_sse2_qwords

    movdqa xmm0, [rcx]
    movdqa xmm1, [rdx]
    pxor xmm0, xmm1
    movdqa [rcx], xmm0

    add rcx, 16
    add rdx, 16
    sub r8d, 16

    jmp do_xor_sse2_loop

do_xor_sse2_qwords:
    /* 8 bytes per iteration while at least 8 bytes remain */
    cmp r8d, 8
    jb do_xor_sse2_bytes

    mov rax, [rcx]
    mov r9, [rdx]
    xor rax, r9
    mov [rcx], rax

    add rcx, 8
    add rdx, 8
    sub r8d, 8

    jmp do_xor_sse2_qwords

do_xor_sse2_bytes:
    /* remaining bytes, one at a time */
    test r8d, r8d
    jz do_xor_sse2_end

    mov al, [rcx]
    mov r9b, [rdx]
    xor al, r9b
    mov [rcx], al

    inc rcx
    inc rdx
    dec r8d

    jmp do_xor_sse2_bytes

do_xor_sse2_end:
    ret

/* void do_xor_avx2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *
 * XORs len bytes of buf2 into buf1 (buf1[i] ^= buf2[i]); buf2 is only read.
 * The 32-byte AVX2 loop is used only when both pointers are 32-byte aligned.
 * Whatever remains is handled 8 bytes at a time and then 1 byte at a time.
 * Arguments are taken from rcx, rdx and r8d; nothing is returned. */
PUBLIC do_xor_avx2
do_xor_avx2:
    /* rcx = buf1
     * rdx = buf2
     * r8d = len (unsigned, hence jb rather than jl in the length checks)
     * rax = tmp1
     * r9 = tmp2
     * ymm0 = tmp3
     * ymm1 = tmp4 */

    /* vmovdqa needs 32-byte-aligned addresses: skip the AVX2 loop if either
     * buffer is misaligned */
    test ecx, 31
    jnz do_xor_avx2_qwords

    test edx, 31
    jnz do_xor_avx2_qwords

do_xor_avx2_loop:
    /* 32 bytes per iteration while at least 32 bytes remain */
    cmp r8d, 32
    jb do_xor_avx2_loop_done

    vmovdqa ymm0, YMMWORD PTR[rcx]
    vmovdqa ymm1, YMMWORD PTR[rdx]
    vpxor ymm0, ymm0, ymm1
    vmovdqa YMMWORD PTR[rcx], ymm0

    add rcx, 32
    add rdx, 32
    sub r8d, 32

    jmp do_xor_avx2_loop

do_xor_avx2_loop_done:
    /* clear the upper ymm halves so that SSE code running after we return
     * does not pay an AVX/SSE transition penalty */
    vzeroupper

do_xor_avx2_qwords:
    /* 8 bytes per iteration while at least 8 bytes remain */
    cmp r8d, 8
    jb do_xor_avx2_bytes

    mov rax, [rcx]
    mov r9, [rdx]
    xor rax, r9
    mov [rcx], rax

    add rcx, 8
    add rdx, 8
    sub r8d, 8

    jmp do_xor_avx2_qwords

do_xor_avx2_bytes:
    /* remaining bytes, one at a time */
    test r8d, r8d
    jz do_xor_avx2_end

    mov al, [rcx]
    mov r9b, [rdx]
    xor al, r9b
    mov [rcx], al

    inc rcx
    inc rdx
    dec r8d

    jmp do_xor_avx2_bytes

do_xor_avx2_end:
    ret
END
#else

.code

/* void __stdcall do_xor_sse2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *
 * XORs len bytes of buf2 into buf1 (buf1[i] ^= buf2[i]); buf2 is only read.
 * The 16-byte SSE2 loop is used only when both pointers are 16-byte aligned.
 * Whatever remains is handled 4 bytes at a time and then 1 byte at a time.
 * Arguments are read from the stack and popped on return (ret 12). */
PUBLIC _do_xor_sse2@12
_do_xor_sse2@12:
    /* edi = buf1
     * edx = buf2
     * esi = len (unsigned, hence jb rather than jl in the length checks)
     * eax = tmp1
     * ecx = tmp2
     * xmm0 = tmp3
     * xmm1 = tmp4 */

    push ebp
    mov ebp, esp

    /* esi and edi are restored before returning */
    push esi
    push edi

    mov edi, [ebp+8]
    mov edx, [ebp+12]
    mov esi, [ebp+16]

    /* movdqa needs 16-byte-aligned addresses: skip the SSE2 loop if either
     * buffer is misaligned */
    test edi, 15
    jnz do_xor_sse2_dwords

    test edx, 15
    jnz do_xor_sse2_dwords

do_xor_sse2_loop:
    /* 16 bytes per iteration while at least 16 bytes remain */
    cmp esi, 16
    jb do_xor_sse2_dwords

    movdqa xmm0, [edi]
    movdqa xmm1, [edx]
    pxor xmm0, xmm1
    movdqa [edi], xmm0

    add edi, 16
    add edx, 16
    sub esi, 16

    jmp do_xor_sse2_loop

do_xor_sse2_dwords:
    /* 4 bytes per iteration while at least 4 bytes remain */
    cmp esi, 4
    jb do_xor_sse2_bytes

    mov eax, [edi]
    mov ecx, [edx]
    xor eax, ecx
    mov [edi], eax

    add edi, 4
    add edx, 4
    sub esi, 4

    jmp do_xor_sse2_dwords

do_xor_sse2_bytes:
    /* remaining bytes, one at a time */
    test esi, esi
    jz do_xor_sse2_end

    mov al, [edi]
    mov cl, [edx]
    xor al, cl
    mov [edi], al

    inc edi
    inc edx
    dec esi

    jmp do_xor_sse2_bytes

do_xor_sse2_end:
    pop edi
    pop esi
    pop ebp

    ret 12

/* void __stdcall do_xor_avx2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *
 * XORs len bytes of buf2 into buf1 (buf1[i] ^= buf2[i]); buf2 is only read.
 * The 32-byte AVX2 loop is used only when both pointers are 32-byte aligned.
 * Whatever remains is handled 4 bytes at a time and then 1 byte at a time.
 * Arguments are read from the stack and popped on return (ret 12). */
PUBLIC _do_xor_avx2@12
_do_xor_avx2@12:
    /* edi = buf1
     * edx = buf2
     * esi = len (unsigned, hence jb rather than jl in the length checks)
     * eax = tmp1
     * ecx = tmp2
     * ymm0 = tmp3
     * ymm1 = tmp4 */

    push ebp
    mov ebp, esp

    /* esi and edi are restored before returning */
    push esi
    push edi

    mov edi, [ebp+8]
    mov edx, [ebp+12]
    mov esi, [ebp+16]

    /* vmovdqa needs 32-byte-aligned addresses: skip the AVX2 loop if either
     * buffer is misaligned */
    test edi, 31
    jnz do_xor_avx2_dwords

    test edx, 31
    jnz do_xor_avx2_dwords

do_xor_avx2_loop:
    /* 32 bytes per iteration while at least 32 bytes remain */
    cmp esi, 32
    jb do_xor_avx2_loop_done

    vmovdqa ymm0, YMMWORD PTR[edi]
    vmovdqa ymm1, YMMWORD PTR[edx]
    vpxor ymm0, ymm0, ymm1
    vmovdqa YMMWORD PTR[edi], ymm0

    add edi, 32
    add edx, 32
    sub esi, 32

    jmp do_xor_avx2_loop

do_xor_avx2_loop_done:
    /* clear the upper ymm halves so that SSE code running after we return
     * does not pay an AVX/SSE transition penalty */
    vzeroupper

do_xor_avx2_dwords:
    /* 4 bytes per iteration while at least 4 bytes remain */
    cmp esi, 4
    jb do_xor_avx2_bytes

    mov eax, [edi]
    mov ecx, [edx]
    xor eax, ecx
    mov [edi], eax

    add edi, 4
    add edx, 4
    sub esi, 4

    jmp do_xor_avx2_dwords

do_xor_avx2_bytes:
    /* remaining bytes, one at a time */
    test esi, esi
    jz do_xor_avx2_end

    mov al, [edi]
    mov cl, [edx]
    xor al, cl
    mov [edi], al

    inc edi
    inc edx
    dec esi

    jmp do_xor_avx2_bytes

do_xor_avx2_end:
    pop edi
    pop esi
    pop ebp

    ret 12

END

#endif