/* Copyright (c) Mark Harmstone 2020
 *
 * This file is part of WinBtrfs.
 *
 * WinBtrfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public Licence as published by
 * the Free Software Foundation, either version 3 of the Licence, or
 * (at your option) any later version.
 *
 * WinBtrfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public Licence for more details.
 *
 * You should have received a copy of the GNU Lesser General Public Licence
 * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */

#include <asm.inc>

#ifdef __x86_64__

.code64

/* void do_xor_sse2(uint8_t* buf1, uint8_t* buf2, uint32_t len); */
PUBLIC do_xor_sse2
do_xor_sse2:
    /* rcx = buf1
     * rdx = buf2
     * r8d = len
     * rax = tmp1
     * r9 = tmp2
     * xmm0 = tmp3
     * xmm1 = tmp4 */
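    /* Three phases: if both buffers are 16-byte aligned, XOR in 16-byte
     * SSE2 blocks; then handle what is left in 8-byte qwords; finally
     * finish any remaining bytes one at a time. */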

    /* fall back to scalar code if buf1 is not 16-byte aligned */
    mov rax, rcx
    and rax, 15
    cmp rax, 0
    jne stragglers2

    /* likewise for buf2 */
    mov rax, rdx
    and rax, 15
    cmp rax, 0
    jne stragglers2

/* XOR 16 bytes per iteration while at least 16 bytes remain */
do_xor_sse2_loop:
    cmp r8d, 16
    jl stragglers2

    movdqa xmm0, [rcx]
    movdqa xmm1, [rdx]
    pxor xmm0, xmm1
    movdqa [rcx], xmm0

    add rcx, 16
    add rdx, 16
    sub r8d, 16

    jmp do_xor_sse2_loop

/* XOR 8 bytes per iteration while at least 8 bytes remain */
stragglers2:

    cmp r8d, 8
    jl stragglers

    mov rax, [rcx]
    mov r9, [rdx]
    xor rax, r9
    mov [rcx], rax

    add rcx, 8
    add rdx, 8
    sub r8d, 8

    jmp stragglers2

/* XOR the remaining bytes one at a time */
stragglers:

    cmp r8d, 0
    je do_xor_sse2_end

    mov al, [rcx]
    mov r9b, [rdx]
    xor al, r9b
    mov [rcx], al

    inc rcx
    inc rdx
    dec r8d

    jmp stragglers

do_xor_sse2_end:
    ret
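
/* For reference, both routines compute the same thing as this portable C
 * sketch; "do_xor_ref" is illustrative only and not part of WinBtrfs:
 *
 *     #include <stdint.h>
 *
 *     static void do_xor_ref(uint8_t* buf1, uint8_t* buf2, uint32_t len)
 *     {
 *         for (uint32_t i = 0; i < len; i++)
 *             buf1[i] ^= buf2[i];    // XOR buf2 into buf1, in place
 *         // buf2 is left untouched
 *     }
 *
 * The SSE2 and AVX2 versions simply vectorise this loop. */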

/* void do_xor_avx2(uint8_t* buf1, uint8_t* buf2, uint32_t len); */
PUBLIC do_xor_avx2
do_xor_avx2:
    /* rcx = buf1
     * rdx = buf2
     * r8d = len
     * rax = tmp1
     * r9 = tmp2
     * ymm0 = tmp3
     * ymm1 = tmp4 */
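    /* Same three-phase structure as do_xor_sse2, but the SIMD loop works
     * in 32-byte AVX2 blocks and so requires 32-byte alignment. */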

    /* fall back to scalar code if buf1 is not 32-byte aligned */
    mov rax, rcx
    and rax, 31
    cmp rax, 0
    jne stragglers4

    /* likewise for buf2 */
    mov rax, rdx
    and rax, 31
    cmp rax, 0
    jne stragglers4

/* XOR 32 bytes per iteration while at least 32 bytes remain */
do_xor_avx2_loop:
    cmp r8d, 32
    jl stragglers4

    vmovdqa ymm0, YMMWORD PTR[rcx]
    vmovdqa ymm1, YMMWORD PTR[rdx]
    vpxor ymm0, ymm0, ymm1
    vmovdqa YMMWORD PTR[rcx], ymm0

    add rcx, 32
    add rdx, 32
    sub r8d, 32

    jmp do_xor_avx2_loop

/* XOR 8 bytes per iteration while at least 8 bytes remain */
stragglers4:

    cmp r8d, 8
    jl stragglers3

    mov rax, [rcx]
    mov r9, [rdx]
    xor rax, r9
    mov [rcx], rax

    add rcx, 8
    add rdx, 8
    sub r8d, 8

    jmp stragglers4

/* XOR the remaining bytes one at a time */
stragglers3:

    cmp r8d, 0
    je do_xor_avx2_end

    mov al, [rcx]
    mov r9b, [rdx]
    xor al, r9b
    mov [rcx], al

    inc rcx
    inc rdx
    dec r8d

    jmp stragglers3

do_xor_avx2_end:
    ret
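
/* A caller must pick between these routines at run time. A hedged sketch
 * of such a dispatch using the MSVC intrinsics (an assumption for
 * illustration: WinBtrfs's actual feature detection may differ, and a
 * complete AVX2 check would also verify OS YMM-state support via
 * OSXSAVE/XGETBV):
 *
 *     #include <intrin.h>
 *     #include <stdint.h>
 *
 *     void do_xor_sse2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *     void do_xor_avx2(uint8_t* buf1, uint8_t* buf2, uint32_t len);
 *
 *     void do_xor(uint8_t* buf1, uint8_t* buf2, uint32_t len)
 *     {
 *         int regs[4];
 *         __cpuidex(regs, 7, 0);        // CPUID leaf 7, subleaf 0
 *         if (regs[1] & (1 << 5))       // EBX bit 5 = AVX2
 *             do_xor_avx2(buf1, buf2, len);
 *         else
 *             do_xor_sse2(buf1, buf2, len);
 *     }
 */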
END
#else

.code

/* void __stdcall do_xor_sse2(uint8_t* buf1, uint8_t* buf2, uint32_t len); */
PUBLIC _do_xor_sse2@12
_do_xor_sse2@12:
    /* edi = buf1
     * edx = buf2
     * esi = len
     * eax = tmp1
     * ecx = tmp2
     * xmm0 = tmp3
     * xmm1 = tmp4 */
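    /* Stack layout after "push ebp; mov ebp, esp" (stdcall pushes the
     * arguments right to left):
     *   [ebp+16] = len
     *   [ebp+12] = buf2
     *   [ebp+8]  = buf1
     * The trailing "ret 12" pops the three dword arguments, as stdcall
     * requires of the callee. */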

    push ebp
    mov ebp, esp

    /* save the callee-saved registers we use */
    push esi
    push edi

    mov edi, [ebp+8]
    mov edx, [ebp+12]
    mov esi, [ebp+16]

    /* fall back to scalar code if buf1 is not 16-byte aligned */
    mov eax, edi
    and eax, 15
    cmp eax, 0
    jne stragglers2

    /* likewise for buf2 */
    mov eax, edx
    and eax, 15
    cmp eax, 0
    jne stragglers2

/* XOR 16 bytes per iteration while at least 16 bytes remain */
do_xor_sse2_loop:
    cmp esi, 16
    jl stragglers2

    movdqa xmm0, [edi]
    movdqa xmm1, [edx]
    pxor xmm0, xmm1
    movdqa [edi], xmm0

    add edi, 16
    add edx, 16
    sub esi, 16

    jmp do_xor_sse2_loop

/* XOR 4 bytes per iteration while at least 4 bytes remain */
stragglers2:

    cmp esi, 4
    jl stragglers

    mov eax, [edi]
    mov ecx, [edx]
    xor eax, ecx
    mov [edi], eax

    add edi, 4
    add edx, 4
    sub esi, 4

    jmp stragglers2

/* XOR the remaining bytes one at a time */
stragglers:

    cmp esi, 0
    je do_xor_sse2_end

    mov al, [edi]
    mov cl, [edx]
    xor al, cl
    mov [edi], al

    inc edi
    inc edx
    dec esi

    jmp stragglers

do_xor_sse2_end:
    pop edi
    pop esi
    pop ebp

    ret 12

/* void __stdcall do_xor_avx2(uint8_t* buf1, uint8_t* buf2, uint32_t len); */
PUBLIC _do_xor_avx2@12
_do_xor_avx2@12:
    /* edi = buf1
     * edx = buf2
     * esi = len
     * eax = tmp1
     * ecx = tmp2
     * ymm0 = tmp3
     * ymm1 = tmp4 */
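    /* Same stack layout and stdcall cleanup as _do_xor_sse2@12 above,
     * with 32-byte AVX2 blocks in the SIMD loop. */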

    push ebp
    mov ebp, esp

    /* save the callee-saved registers we use */
    push esi
    push edi

    mov edi, [ebp+8]
    mov edx, [ebp+12]
    mov esi, [ebp+16]

    /* fall back to scalar code if buf1 is not 32-byte aligned */
    mov eax, edi
    and eax, 31
    cmp eax, 0
    jne stragglers4

    /* likewise for buf2 */
    mov eax, edx
    and eax, 31
    cmp eax, 0
    jne stragglers4

/* XOR 32 bytes per iteration while at least 32 bytes remain */
do_xor_avx2_loop:
    cmp esi, 32
    jl stragglers4

    vmovdqa ymm0, YMMWORD PTR[edi]
    vmovdqa ymm1, YMMWORD PTR[edx]
    vpxor ymm0, ymm0, ymm1
    vmovdqa YMMWORD PTR[edi], ymm0

    add edi, 32
    add edx, 32
    sub esi, 32

    jmp do_xor_avx2_loop

/* XOR 4 bytes per iteration while at least 4 bytes remain */
stragglers4:

    cmp esi, 4
    jl stragglers3

    mov eax, [edi]
    mov ecx, [edx]
    xor eax, ecx
    mov [edi], eax

    add edi, 4
    add edx, 4
    sub esi, 4

    jmp stragglers4

/* XOR the remaining bytes one at a time */
stragglers3:

    cmp esi, 0
    je do_xor_avx2_end

    mov al, [edi]
    mov cl, [edx]
    xor al, cl
    mov [edi], al

    inc edi
    inc edx
    dec esi

    jmp stragglers3

do_xor_avx2_end:
    pop edi
    pop esi
    pop ebp

    ret 12

END

#endif