1;; https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
2;; The first four integer arguments are passed in registers.
3;; Integer values are passed in left-to-right order in RCX,
4;; RDX, R8, and R9, respectively. Arguments five and higher
5;; are passed on the stack.
6
7;; The registers RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5,
8;; and the upper portions of YMM0-15 and ZMM0-15 are
9;; considered volatile and must be considered destroyed on
10;; function calls.
11
12include ksamd64.inc
13EXTERNDEF ?Te@rdtable@CryptoPP@@3PA_KA:FAR
14EXTERNDEF ?g_cacheLineSize@CryptoPP@@3IA:FAR
15EXTERNDEF ?SHA256_K@CryptoPP@@3QBIB:FAR
16.CODE
17
18    ALIGN   8
19Baseline_Add	PROC
20	lea		rdx, [rdx+8*rcx]
21	lea		r8, [r8+8*rcx]
22	lea		r9, [r9+8*rcx]
23	neg		rcx					; rcx is negative index
24	jz		$1@Baseline_Add
25	mov		rax,[r8+8*rcx]
26	add		rax,[r9+8*rcx]
27	mov		[rdx+8*rcx],rax
28$0@Baseline_Add:
29	mov		rax,[r8+8*rcx+8]
30	adc		rax,[r9+8*rcx+8]
31	mov		[rdx+8*rcx+8],rax
32	lea		rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
33	jrcxz	$1@Baseline_Add		; loop until rcx overflows and becomes zero
34	mov		rax,[r8+8*rcx]
35	adc		rax,[r9+8*rcx]
36	mov		[rdx+8*rcx],rax
37	jmp		$0@Baseline_Add
38$1@Baseline_Add:
39	mov		rax, 0
40	adc		rax, rax			; store carry into rax (return result register)
41	ret
42Baseline_Add ENDP
43
44    ALIGN   8
45Baseline_Sub	PROC
46	lea		rdx, [rdx+8*rcx]
47	lea		r8, [r8+8*rcx]
48	lea		r9, [r9+8*rcx]
49	neg		rcx					; rcx is negative index
50	jz		$1@Baseline_Sub
51	mov		rax,[r8+8*rcx]
52	sub		rax,[r9+8*rcx]
53	mov		[rdx+8*rcx],rax
54$0@Baseline_Sub:
55	mov		rax,[r8+8*rcx+8]
56	sbb		rax,[r9+8*rcx+8]
57	mov		[rdx+8*rcx+8],rax
58	lea		rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
59	jrcxz	$1@Baseline_Sub		; loop until rcx overflows and becomes zero
60	mov		rax,[r8+8*rcx]
61	sbb		rax,[r9+8*rcx]
62	mov		[rdx+8*rcx],rax
63	jmp		$0@Baseline_Sub
64$1@Baseline_Sub:
65	mov		rax, 0
66	adc		rax, rax			; store carry into rax (return result register)
67
68	ret
69Baseline_Sub ENDP
70
71ALIGN   8
72Rijndael_Enc_AdvancedProcessBlocks_SSE2	PROC FRAME
73rex_push_reg rsi
74push_reg rdi
75push_reg rbx
76push_reg r12
77.endprolog
78mov r8, rcx
79mov r11, ?Te@rdtable@CryptoPP@@3PA_KA
80mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
81mov rsi, [(r8+16*19)]
82mov rax, 16
83and rax, rsi
84movdqa xmm3, XMMWORD PTR [rdx+16+rax]
85movdqa [(r8+16*12)], xmm3
86lea rax, [rdx+rax+2*16]
87sub rax, rsi
88label0:
89movdqa xmm0, [rax+rsi]
90movdqa XMMWORD PTR [(r8+0)+rsi], xmm0
91add rsi, 16
92cmp rsi, 16*12
93jl label0
94movdqa xmm4, [rax+rsi]
95movdqa xmm1, [rdx]
96mov r12d, [rdx+4*4]
97mov ebx, [rdx+5*4]
98mov ecx, [rdx+6*4]
99mov edx, [rdx+7*4]
100xor rax, rax
101label9:
102mov esi, [r11+rax]
103add rax, rdi
104mov esi, [r11+rax]
105add rax, rdi
106mov esi, [r11+rax]
107add rax, rdi
108mov esi, [r11+rax]
109add rax, rdi
110cmp rax, 2048
111jl label9
112lfence
113test DWORD PTR [(r8+16*18+8)], 1
114jz label8
115mov rsi, [(r8+16*14)]
116movdqu xmm2, [rsi]
117pxor xmm2, xmm1
118psrldq xmm1, 14
119movd eax, xmm1
120mov al, BYTE PTR [rsi+15]
121mov r10d, eax
122movd eax, xmm2
123psrldq xmm2, 4
124movd edi, xmm2
125psrldq xmm2, 4
126movzx esi, al
127xor r12d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
128movzx esi, ah
129xor edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
130shr eax, 16
131movzx esi, al
132xor ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
133movzx esi, ah
134xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
135mov eax, edi
136movd edi, xmm2
137psrldq xmm2, 4
138movzx esi, al
139xor ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
140movzx esi, ah
141xor r12d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
142shr eax, 16
143movzx esi, al
144xor edx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
145movzx esi, ah
146xor ecx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
147mov eax, edi
148movd edi, xmm2
149movzx esi, al
150xor ecx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
151movzx esi, ah
152xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
153shr eax, 16
154movzx esi, al
155xor r12d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
156movzx esi, ah
157xor edx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
158mov eax, edi
159movzx esi, al
160xor edx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
161movzx esi, ah
162xor ecx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
163shr eax, 16
164movzx esi, al
165xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
166psrldq xmm2, 3
167mov eax, [(r8+16*12)+0*4]
168mov edi, [(r8+16*12)+2*4]
169mov r9d, [(r8+16*12)+3*4]
170movzx esi, cl
171xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
172movzx esi, bl
173xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
174movzx esi, bh
175xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
176shr ebx, 16
177movzx esi, bl
178xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
179movzx esi, bh
180mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
181xor ebx, [(r8+16*12)+1*4]
182movzx esi, ch
183xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
184shr ecx, 16
185movzx esi, dl
186xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
187movzx esi, dh
188xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
189shr edx, 16
190movzx esi, ch
191xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
192movzx esi, cl
193xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
194movzx esi, dl
195xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
196movzx esi, dh
197xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
198movd ecx, xmm2
199mov edx, r12d
200mov [(r8+0)+3*4], r9d
201mov [(r8+0)+0*4], eax
202mov [(r8+0)+1*4], ebx
203mov [(r8+0)+2*4], edi
204jmp label5
205label3:
206mov r12d, [(r8+16*12)+0*4]
207mov ebx, [(r8+16*12)+1*4]
208mov ecx, [(r8+16*12)+2*4]
209mov edx, [(r8+16*12)+3*4]
210label8:
211mov rax, [(r8+16*14)]
212movdqu xmm2, [rax]
213mov rsi, [(r8+16*14)+8]
214movdqu xmm5, [rsi]
215pxor xmm2, xmm1
216pxor xmm2, xmm5
217movd eax, xmm2
218psrldq xmm2, 4
219movd edi, xmm2
220psrldq xmm2, 4
221movzx esi, al
222xor r12d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
223movzx esi, ah
224xor edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
225shr eax, 16
226movzx esi, al
227xor ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
228movzx esi, ah
229xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
230mov eax, edi
231movd edi, xmm2
232psrldq xmm2, 4
233movzx esi, al
234xor ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
235movzx esi, ah
236xor r12d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
237shr eax, 16
238movzx esi, al
239xor edx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
240movzx esi, ah
241xor ecx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
242mov eax, edi
243movd edi, xmm2
244movzx esi, al
245xor ecx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
246movzx esi, ah
247xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
248shr eax, 16
249movzx esi, al
250xor r12d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
251movzx esi, ah
252xor edx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
253mov eax, edi
254movzx esi, al
255xor edx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
256movzx esi, ah
257xor ecx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
258shr eax, 16
259movzx esi, al
260xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
261movzx esi, ah
262xor r12d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
263mov eax, r12d
264add r8, [(r8+16*19)]
265add r8, 4*16
266jmp label2
267label1:
268mov ecx, r10d
269mov edx, r12d
270mov eax, [(r8+0)+0*4]
271mov ebx, [(r8+0)+1*4]
272xor cl, ch
273and rcx, 255
274label5:
275add r10d, 1
276xor edx, DWORD PTR [r11+rcx*8+3]
277movzx esi, dl
278xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
279movzx esi, dh
280mov ecx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
281shr edx, 16
282xor ecx, [(r8+0)+2*4]
283movzx esi, dh
284xor eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
285movzx esi, dl
286mov edx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
287xor edx, [(r8+0)+3*4]
288add r8, [(r8+16*19)]
289add r8, 3*16
290jmp label4
291label2:
292mov r9d, [(r8+0)-4*16+3*4]
293mov edi, [(r8+0)-4*16+2*4]
294movzx esi, cl
295xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
296mov cl, al
297movzx esi, ah
298xor edi, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
299shr eax, 16
300movzx esi, bl
301xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
302movzx esi, bh
303xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
304shr ebx, 16
305movzx esi, al
306xor r9d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
307movzx esi, ah
308mov eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
309movzx esi, bl
310xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
311movzx esi, bh
312mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
313movzx esi, ch
314xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
315movzx esi, cl
316xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
317shr ecx, 16
318movzx esi, dl
319xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
320movzx esi, dh
321xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
322shr edx, 16
323movzx esi, ch
324xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
325movzx esi, cl
326xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
327movzx esi, dl
328xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
329movzx esi, dh
330xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
331mov ecx, edi
332xor eax, [(r8+0)-4*16+0*4]
333xor ebx, [(r8+0)-4*16+1*4]
334mov edx, r9d
335label4:
336mov r9d, [(r8+0)-4*16+7*4]
337mov edi, [(r8+0)-4*16+6*4]
338movzx esi, cl
339xor r9d, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
340mov cl, al
341movzx esi, ah
342xor edi, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
343shr eax, 16
344movzx esi, bl
345xor edi, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
346movzx esi, bh
347xor r9d, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
348shr ebx, 16
349movzx esi, al
350xor r9d, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
351movzx esi, ah
352mov eax, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
353movzx esi, bl
354xor eax, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
355movzx esi, bh
356mov ebx, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
357movzx esi, ch
358xor eax, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
359movzx esi, cl
360xor ebx, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
361shr ecx, 16
362movzx esi, dl
363xor eax, DWORD PTR [r11+8*rsi+(((3+3) MOD (4))+1)]
364movzx esi, dh
365xor ebx, DWORD PTR [r11+8*rsi+(((2+3) MOD (4))+1)]
366shr edx, 16
367movzx esi, ch
368xor edi, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
369movzx esi, cl
370xor ebx, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
371movzx esi, dl
372xor edi, DWORD PTR [r11+8*rsi+(((1+3) MOD (4))+1)]
373movzx esi, dh
374xor r9d, DWORD PTR [r11+8*rsi+(((0+3) MOD (4))+1)]
375mov ecx, edi
376xor eax, [(r8+0)-4*16+4*4]
377xor ebx, [(r8+0)-4*16+5*4]
378mov edx, r9d
379add r8, 32
380test r8, 255
381jnz label2
382sub r8, 16*16
383movzx esi, ch
384movzx edi, BYTE PTR [r11+rsi*8+1]
385movzx esi, dl
386xor edi, DWORD PTR [r11+rsi*8+0]
387mov WORD PTR [(r8+16*13)+2], di
388movzx esi, dh
389movzx edi, BYTE PTR [r11+rsi*8+1]
390movzx esi, al
391xor edi, DWORD PTR [r11+rsi*8+0]
392mov WORD PTR [(r8+16*13)+6], di
393shr edx, 16
394movzx esi, ah
395movzx edi, BYTE PTR [r11+rsi*8+1]
396movzx esi, bl
397xor edi, DWORD PTR [r11+rsi*8+0]
398mov WORD PTR [(r8+16*13)+10], di
399shr eax, 16
400movzx esi, bh
401movzx edi, BYTE PTR [r11+rsi*8+1]
402movzx esi, cl
403xor edi, DWORD PTR [r11+rsi*8+0]
404mov WORD PTR [(r8+16*13)+14], di
405shr ebx, 16
406movzx esi, dh
407movzx edi, BYTE PTR [r11+rsi*8+1]
408movzx esi, al
409xor edi, DWORD PTR [r11+rsi*8+0]
410mov WORD PTR [(r8+16*13)+12], di
411shr ecx, 16
412movzx esi, ah
413movzx edi, BYTE PTR [r11+rsi*8+1]
414movzx esi, bl
415xor edi, DWORD PTR [r11+rsi*8+0]
416mov WORD PTR [(r8+16*13)+0], di
417movzx esi, bh
418movzx edi, BYTE PTR [r11+rsi*8+1]
419movzx esi, cl
420xor edi, DWORD PTR [r11+rsi*8+0]
421mov WORD PTR [(r8+16*13)+4], di
422movzx esi, ch
423movzx edi, BYTE PTR [r11+rsi*8+1]
424movzx esi, dl
425xor edi, DWORD PTR [r11+rsi*8+0]
426mov WORD PTR [(r8+16*13)+8], di
427mov rax, [(r8+16*14)+16]
428mov rbx, [(r8+16*14)+24]
429mov rcx, [(r8+16*18+8)]
430sub rcx, 16
431movdqu xmm2, [rax]
432pxor xmm2, xmm4
433movdqa xmm0, [(r8+16*16)+16]
434paddq xmm0, [(r8+16*14)+16]
435movdqa [(r8+16*14)+16], xmm0
436pxor xmm2, [(r8+16*13)]
437movdqu [rbx], xmm2
438jle label7
439mov [(r8+16*18+8)], rcx
440test rcx, 1
441jnz label1
442movdqa xmm0, [(r8+16*16)]
443paddq xmm0, [(r8+16*14)]
444movdqa [(r8+16*14)], xmm0
445jmp label3
446label7:
447xorps xmm0, xmm0
448lea rax, [(r8+0)+7*16]
449movaps [rax-7*16], xmm0
450movaps [rax-6*16], xmm0
451movaps [rax-5*16], xmm0
452movaps [rax-4*16], xmm0
453movaps [rax-3*16], xmm0
454movaps [rax-2*16], xmm0
455movaps [rax-1*16], xmm0
456movaps [rax+0*16], xmm0
457movaps [rax+1*16], xmm0
458movaps [rax+2*16], xmm0
459movaps [rax+3*16], xmm0
460movaps [rax+4*16], xmm0
461movaps [rax+5*16], xmm0
462movaps [rax+6*16], xmm0
463pop r12
464pop rbx
465pop rdi
466pop rsi
467ret
468Rijndael_Enc_AdvancedProcessBlocks_SSE2 ENDP
469
470ALIGN   8
471GCM_AuthenticateBlocks_2K_SSE2	PROC FRAME
472rex_push_reg rsi
473push_reg rdi
474push_reg rbx
475.endprolog
476mov rsi, r8
477mov r11, r9
478movdqa xmm0, [rsi]
479label0:
480movdqu xmm4, [rcx]
481pxor xmm0, xmm4
482movd ebx, xmm0
483mov eax, 0f0f0f0f0h
484and eax, ebx
485shl ebx, 4
486and ebx, 0f0f0f0f0h
487movzx edi, ah
488movdqa xmm5, XMMWORD PTR [rsi + 32 + 1024 + rdi]
489movzx edi, al
490movdqa xmm4, XMMWORD PTR [rsi + 32 + 1024 + rdi]
491shr eax, 16
492movzx edi, ah
493movdqa xmm3, XMMWORD PTR [rsi + 32 + 1024 + rdi]
494movzx edi, al
495movdqa xmm2, XMMWORD PTR [rsi + 32 + 1024 + rdi]
496psrldq xmm0, 4
497movd eax, xmm0
498and eax, 0f0f0f0f0h
499movzx edi, bh
500pxor xmm5, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
501movzx edi, bl
502pxor xmm4, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
503shr ebx, 16
504movzx edi, bh
505pxor xmm3, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
506movzx edi, bl
507pxor xmm2, XMMWORD PTR [rsi + 32 + (1-1)*256 + rdi]
508movd ebx, xmm0
509shl ebx, 4
510and ebx, 0f0f0f0f0h
511movzx edi, ah
512pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
513movzx edi, al
514pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
515shr eax, 16
516movzx edi, ah
517pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
518movzx edi, al
519pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 1*256 + rdi]
520psrldq xmm0, 4
521movd eax, xmm0
522and eax, 0f0f0f0f0h
523movzx edi, bh
524pxor xmm5, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
525movzx edi, bl
526pxor xmm4, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
527shr ebx, 16
528movzx edi, bh
529pxor xmm3, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
530movzx edi, bl
531pxor xmm2, XMMWORD PTR [rsi + 32 + (2-1)*256 + rdi]
532movd ebx, xmm0
533shl ebx, 4
534and ebx, 0f0f0f0f0h
535movzx edi, ah
536pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
537movzx edi, al
538pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
539shr eax, 16
540movzx edi, ah
541pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
542movzx edi, al
543pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 2*256 + rdi]
544psrldq xmm0, 4
545movd eax, xmm0
546and eax, 0f0f0f0f0h
547movzx edi, bh
548pxor xmm5, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
549movzx edi, bl
550pxor xmm4, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
551shr ebx, 16
552movzx edi, bh
553pxor xmm3, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
554movzx edi, bl
555pxor xmm2, XMMWORD PTR [rsi + 32 + (3-1)*256 + rdi]
556movd ebx, xmm0
557shl ebx, 4
558and ebx, 0f0f0f0f0h
559movzx edi, ah
560pxor xmm5, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
561movzx edi, al
562pxor xmm4, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
563shr eax, 16
564movzx edi, ah
565pxor xmm3, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
566movzx edi, al
567pxor xmm2, XMMWORD PTR [rsi + 32 + 1024 + 3*256 + rdi]
568movzx edi, bh
569pxor xmm5, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
570movzx edi, bl
571pxor xmm4, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
572shr ebx, 16
573movzx edi, bh
574pxor xmm3, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
575movzx edi, bl
576pxor xmm2, XMMWORD PTR [rsi + 32 + 3*256 + rdi]
577movdqa xmm0, xmm3
578pslldq xmm3, 1
579pxor xmm2, xmm3
580movdqa xmm1, xmm2
581pslldq xmm2, 1
582pxor xmm5, xmm2
583psrldq xmm0, 15
584movd rdi, xmm0
585movzx eax, WORD PTR [r11 + rdi*2]
586shl eax, 8
587movdqa xmm0, xmm5
588pslldq xmm5, 1
589pxor xmm4, xmm5
590psrldq xmm1, 15
591movd rdi, xmm1
592xor ax, WORD PTR [r11 + rdi*2]
593shl eax, 8
594psrldq xmm0, 15
595movd rdi, xmm0
596xor ax, WORD PTR [r11 + rdi*2]
597movd xmm0, eax
598pxor xmm0, xmm4
599add rcx, 16
600sub rdx, 1
601jnz label0
602movdqa [rsi], xmm0
603pop rbx
604pop rdi
605pop rsi
606ret
607GCM_AuthenticateBlocks_2K_SSE2 ENDP
608
609ALIGN   8
610GCM_AuthenticateBlocks_64K_SSE2	PROC FRAME
611rex_push_reg rsi
612push_reg rdi
613.endprolog
614mov rsi, r8
615movdqa xmm0, [rsi]
616label1:
617movdqu xmm1, [rcx]
618pxor xmm1, xmm0
619pxor xmm0, xmm0
620movd eax, xmm1
621psrldq xmm1, 4
622movzx edi, al
623add rdi, rdi
624pxor xmm0, [rsi + 32 + (0*4+0)*256*16 + rdi*8]
625movzx edi, ah
626add rdi, rdi
627pxor xmm0, [rsi + 32 + (0*4+1)*256*16 + rdi*8]
628shr eax, 16
629movzx edi, al
630add rdi, rdi
631pxor xmm0, [rsi + 32 + (0*4+2)*256*16 + rdi*8]
632movzx edi, ah
633add rdi, rdi
634pxor xmm0, [rsi + 32 + (0*4+3)*256*16 + rdi*8]
635movd eax, xmm1
636psrldq xmm1, 4
637movzx edi, al
638add rdi, rdi
639pxor xmm0, [rsi + 32 + (1*4+0)*256*16 + rdi*8]
640movzx edi, ah
641add rdi, rdi
642pxor xmm0, [rsi + 32 + (1*4+1)*256*16 + rdi*8]
643shr eax, 16
644movzx edi, al
645add rdi, rdi
646pxor xmm0, [rsi + 32 + (1*4+2)*256*16 + rdi*8]
647movzx edi, ah
648add rdi, rdi
649pxor xmm0, [rsi + 32 + (1*4+3)*256*16 + rdi*8]
650movd eax, xmm1
651psrldq xmm1, 4
652movzx edi, al
653add rdi, rdi
654pxor xmm0, [rsi + 32 + (2*4+0)*256*16 + rdi*8]
655movzx edi, ah
656add rdi, rdi
657pxor xmm0, [rsi + 32 + (2*4+1)*256*16 + rdi*8]
658shr eax, 16
659movzx edi, al
660add rdi, rdi
661pxor xmm0, [rsi + 32 + (2*4+2)*256*16 + rdi*8]
662movzx edi, ah
663add rdi, rdi
664pxor xmm0, [rsi + 32 + (2*4+3)*256*16 + rdi*8]
665movd eax, xmm1
666psrldq xmm1, 4
667movzx edi, al
668add rdi, rdi
669pxor xmm0, [rsi + 32 + (3*4+0)*256*16 + rdi*8]
670movzx edi, ah
671add rdi, rdi
672pxor xmm0, [rsi + 32 + (3*4+1)*256*16 + rdi*8]
673shr eax, 16
674movzx edi, al
675add rdi, rdi
676pxor xmm0, [rsi + 32 + (3*4+2)*256*16 + rdi*8]
677movzx edi, ah
678add rdi, rdi
679pxor xmm0, [rsi + 32 + (3*4+3)*256*16 + rdi*8]
680add rcx, 16
681sub rdx, 1
682jnz label1
683movdqa [rsi], xmm0
684pop rdi
685pop rsi
686ret
687GCM_AuthenticateBlocks_64K_SSE2 ENDP
688
689ALIGN   8
690SHA256_HashMultipleBlocks_SSE2	PROC FRAME
691rex_push_reg rsi
692push_reg rdi
693push_reg rbx
694push_reg rbp
695alloc_stack(8*4 + 16*4 + 4*8 + 8)
696.endprolog
697mov rdi, r8
698lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4]
699mov [rsp+8*4+16*4+1*8], rcx
700mov [rsp+8*4+16*4+2*8], rdx
701add rdi, rdx
702mov [rsp+8*4+16*4+3*8], rdi
703movdqa xmm0, XMMWORD PTR [rcx+0*16]
704movdqa xmm1, XMMWORD PTR [rcx+1*16]
705mov [rsp+8*4+16*4+0*8], rsi
706label0:
707sub rsi, 48*4
708movdqa [rsp+((1024+7-(0+3)) MOD (8))*4], xmm1
709movdqa [rsp+((1024+7-(0+7)) MOD (8))*4], xmm0
710mov rbx, [rdx+0*8]
711bswap rbx
712mov [rsp+8*4+((1024+15-(0*(1+1)+1)) MOD (16))*4], rbx
713mov rbx, [rdx+1*8]
714bswap rbx
715mov [rsp+8*4+((1024+15-(1*(1+1)+1)) MOD (16))*4], rbx
716mov rbx, [rdx+2*8]
717bswap rbx
718mov [rsp+8*4+((1024+15-(2*(1+1)+1)) MOD (16))*4], rbx
719mov rbx, [rdx+3*8]
720bswap rbx
721mov [rsp+8*4+((1024+15-(3*(1+1)+1)) MOD (16))*4], rbx
722mov rbx, [rdx+4*8]
723bswap rbx
724mov [rsp+8*4+((1024+15-(4*(1+1)+1)) MOD (16))*4], rbx
725mov rbx, [rdx+5*8]
726bswap rbx
727mov [rsp+8*4+((1024+15-(5*(1+1)+1)) MOD (16))*4], rbx
728mov rbx, [rdx+6*8]
729bswap rbx
730mov [rsp+8*4+((1024+15-(6*(1+1)+1)) MOD (16))*4], rbx
731mov rbx, [rdx+7*8]
732bswap rbx
733mov [rsp+8*4+((1024+15-(7*(1+1)+1)) MOD (16))*4], rbx
734mov edi, [rsp+((1024+7-(0+3)) MOD (8))*4]
735mov eax, [rsp+((1024+7-(0+6)) MOD (8))*4]
736xor eax, [rsp+((1024+7-(0+5)) MOD (8))*4]
737mov ecx, [rsp+((1024+7-(0+7)) MOD (8))*4]
738mov edx, [rsp+((1024+7-(0+2)) MOD (8))*4]
739xor edx, [rsp+((1024+7-(0+1)) MOD (8))*4]
740and edx, edi
741xor edx, [rsp+((1024+7-(0+1)) MOD (8))*4]
742mov ebp, edi
743ror edi, 6
744ror ebp, 25
745add edx, [rsi+(0)*4]
746add edx, [rsp+8*4+((1024+15-(0)) MOD (16))*4]
747add edx, [rsp+((1024+7-(0)) MOD (8))*4]
748xor ebp, edi
749ror edi, 5
750xor ebp, edi
751add edx, ebp
752mov ebx, ecx
753xor ecx, [rsp+((1024+7-(0+6)) MOD (8))*4]
754and eax, ecx
755xor eax, [rsp+((1024+7-(0+6)) MOD (8))*4]
756mov ebp, ebx
757ror ebx, 2
758add eax, edx
759add edx, [rsp+((1024+7-(0+4)) MOD (8))*4]
760mov [rsp+((1024+7-(0+4)) MOD (8))*4], edx
761ror ebp, 22
762xor ebp, ebx
763ror ebx, 11
764xor ebp, ebx
765add eax, ebp
766mov [rsp+((1024+7-(0)) MOD (8))*4], eax
767mov edi, [rsp+((1024+7-(1+2)) MOD (8))*4]
768xor edi, [rsp+((1024+7-(1+1)) MOD (8))*4]
769and edi, edx
770xor edi, [rsp+((1024+7-(1+1)) MOD (8))*4]
771mov ebp, edx
772ror edx, 6
773ror ebp, 25
774add edi, [rsi+(1)*4]
775add edi, [rsp+8*4+((1024+15-(1)) MOD (16))*4]
776add edi, [rsp+((1024+7-(1)) MOD (8))*4]
777xor ebp, edx
778ror edx, 5
779xor ebp, edx
780add edi, ebp
781mov ebx, eax
782xor eax, [rsp+((1024+7-(1+6)) MOD (8))*4]
783and ecx, eax
784xor ecx, [rsp+((1024+7-(1+6)) MOD (8))*4]
785mov ebp, ebx
786ror ebx, 2
787add ecx, edi
788add edi, [rsp+((1024+7-(1+4)) MOD (8))*4]
789mov [rsp+((1024+7-(1+4)) MOD (8))*4], edi
790ror ebp, 22
791xor ebp, ebx
792ror ebx, 11
793xor ebp, ebx
794add ecx, ebp
795mov [rsp+((1024+7-(1)) MOD (8))*4], ecx
796mov edx, [rsp+((1024+7-(2+2)) MOD (8))*4]
797xor edx, [rsp+((1024+7-(2+1)) MOD (8))*4]
798and edx, edi
799xor edx, [rsp+((1024+7-(2+1)) MOD (8))*4]
800mov ebp, edi
801ror edi, 6
802ror ebp, 25
803add edx, [rsi+(2)*4]
804add edx, [rsp+8*4+((1024+15-(2)) MOD (16))*4]
805add edx, [rsp+((1024+7-(2)) MOD (8))*4]
806xor ebp, edi
807ror edi, 5
808xor ebp, edi
809add edx, ebp
810mov ebx, ecx
811xor ecx, [rsp+((1024+7-(2+6)) MOD (8))*4]
812and eax, ecx
813xor eax, [rsp+((1024+7-(2+6)) MOD (8))*4]
814mov ebp, ebx
815ror ebx, 2
816add eax, edx
817add edx, [rsp+((1024+7-(2+4)) MOD (8))*4]
818mov [rsp+((1024+7-(2+4)) MOD (8))*4], edx
819ror ebp, 22
820xor ebp, ebx
821ror ebx, 11
822xor ebp, ebx
823add eax, ebp
824mov [rsp+((1024+7-(2)) MOD (8))*4], eax
825mov edi, [rsp+((1024+7-(3+2)) MOD (8))*4]
826xor edi, [rsp+((1024+7-(3+1)) MOD (8))*4]
827and edi, edx
828xor edi, [rsp+((1024+7-(3+1)) MOD (8))*4]
829mov ebp, edx
830ror edx, 6
831ror ebp, 25
832add edi, [rsi+(3)*4]
833add edi, [rsp+8*4+((1024+15-(3)) MOD (16))*4]
834add edi, [rsp+((1024+7-(3)) MOD (8))*4]
835xor ebp, edx
836ror edx, 5
837xor ebp, edx
838add edi, ebp
839mov ebx, eax
840xor eax, [rsp+((1024+7-(3+6)) MOD (8))*4]
841and ecx, eax
842xor ecx, [rsp+((1024+7-(3+6)) MOD (8))*4]
843mov ebp, ebx
844ror ebx, 2
845add ecx, edi
846add edi, [rsp+((1024+7-(3+4)) MOD (8))*4]
847mov [rsp+((1024+7-(3+4)) MOD (8))*4], edi
848ror ebp, 22
849xor ebp, ebx
850ror ebx, 11
851xor ebp, ebx
852add ecx, ebp
853mov [rsp+((1024+7-(3)) MOD (8))*4], ecx
854mov edx, [rsp+((1024+7-(4+2)) MOD (8))*4]
855xor edx, [rsp+((1024+7-(4+1)) MOD (8))*4]
856and edx, edi
857xor edx, [rsp+((1024+7-(4+1)) MOD (8))*4]
858mov ebp, edi
859ror edi, 6
860ror ebp, 25
861add edx, [rsi+(4)*4]
862add edx, [rsp+8*4+((1024+15-(4)) MOD (16))*4]
863add edx, [rsp+((1024+7-(4)) MOD (8))*4]
864xor ebp, edi
865ror edi, 5
866xor ebp, edi
867add edx, ebp
868mov ebx, ecx
869xor ecx, [rsp+((1024+7-(4+6)) MOD (8))*4]
870and eax, ecx
871xor eax, [rsp+((1024+7-(4+6)) MOD (8))*4]
872mov ebp, ebx
873ror ebx, 2
874add eax, edx
875add edx, [rsp+((1024+7-(4+4)) MOD (8))*4]
876mov [rsp+((1024+7-(4+4)) MOD (8))*4], edx
877ror ebp, 22
878xor ebp, ebx
879ror ebx, 11
880xor ebp, ebx
881add eax, ebp
882mov [rsp+((1024+7-(4)) MOD (8))*4], eax
883mov edi, [rsp+((1024+7-(5+2)) MOD (8))*4]
884xor edi, [rsp+((1024+7-(5+1)) MOD (8))*4]
885and edi, edx
886xor edi, [rsp+((1024+7-(5+1)) MOD (8))*4]
887mov ebp, edx
888ror edx, 6
889ror ebp, 25
890add edi, [rsi+(5)*4]
891add edi, [rsp+8*4+((1024+15-(5)) MOD (16))*4]
892add edi, [rsp+((1024+7-(5)) MOD (8))*4]
893xor ebp, edx
894ror edx, 5
895xor ebp, edx
896add edi, ebp
897mov ebx, eax
898xor eax, [rsp+((1024+7-(5+6)) MOD (8))*4]
899and ecx, eax
900xor ecx, [rsp+((1024+7-(5+6)) MOD (8))*4]
901mov ebp, ebx
902ror ebx, 2
903add ecx, edi
904add edi, [rsp+((1024+7-(5+4)) MOD (8))*4]
905mov [rsp+((1024+7-(5+4)) MOD (8))*4], edi
906ror ebp, 22
907xor ebp, ebx
908ror ebx, 11
909xor ebp, ebx
910add ecx, ebp
911mov [rsp+((1024+7-(5)) MOD (8))*4], ecx
912mov edx, [rsp+((1024+7-(6+2)) MOD (8))*4]
913xor edx, [rsp+((1024+7-(6+1)) MOD (8))*4]
914and edx, edi
915xor edx, [rsp+((1024+7-(6+1)) MOD (8))*4]
916mov ebp, edi
917ror edi, 6
918ror ebp, 25
919add edx, [rsi+(6)*4]
920add edx, [rsp+8*4+((1024+15-(6)) MOD (16))*4]
921add edx, [rsp+((1024+7-(6)) MOD (8))*4]
922xor ebp, edi
923ror edi, 5
924xor ebp, edi
925add edx, ebp
926mov ebx, ecx
927xor ecx, [rsp+((1024+7-(6+6)) MOD (8))*4]
928and eax, ecx
929xor eax, [rsp+((1024+7-(6+6)) MOD (8))*4]
930mov ebp, ebx
931ror ebx, 2
932add eax, edx
933add edx, [rsp+((1024+7-(6+4)) MOD (8))*4]
934mov [rsp+((1024+7-(6+4)) MOD (8))*4], edx
935ror ebp, 22
936xor ebp, ebx
937ror ebx, 11
938xor ebp, ebx
939add eax, ebp
940mov [rsp+((1024+7-(6)) MOD (8))*4], eax
941mov edi, [rsp+((1024+7-(7+2)) MOD (8))*4]
942xor edi, [rsp+((1024+7-(7+1)) MOD (8))*4]
943and edi, edx
944xor edi, [rsp+((1024+7-(7+1)) MOD (8))*4]
945mov ebp, edx
946ror edx, 6
947ror ebp, 25
948add edi, [rsi+(7)*4]
949add edi, [rsp+8*4+((1024+15-(7)) MOD (16))*4]
950add edi, [rsp+((1024+7-(7)) MOD (8))*4]
951xor ebp, edx
952ror edx, 5
953xor ebp, edx
954add edi, ebp
955mov ebx, eax
956xor eax, [rsp+((1024+7-(7+6)) MOD (8))*4]
957and ecx, eax
958xor ecx, [rsp+((1024+7-(7+6)) MOD (8))*4]
959mov ebp, ebx
960ror ebx, 2
961add ecx, edi
962add edi, [rsp+((1024+7-(7+4)) MOD (8))*4]
963mov [rsp+((1024+7-(7+4)) MOD (8))*4], edi
964ror ebp, 22
965xor ebp, ebx
966ror ebx, 11
967xor ebp, ebx
968add ecx, ebp
969mov [rsp+((1024+7-(7)) MOD (8))*4], ecx
970mov edx, [rsp+((1024+7-(8+2)) MOD (8))*4]
971xor edx, [rsp+((1024+7-(8+1)) MOD (8))*4]
972and edx, edi
973xor edx, [rsp+((1024+7-(8+1)) MOD (8))*4]
974mov ebp, edi
975ror edi, 6
976ror ebp, 25
977add edx, [rsi+(8)*4]
978add edx, [rsp+8*4+((1024+15-(8)) MOD (16))*4]
979add edx, [rsp+((1024+7-(8)) MOD (8))*4]
980xor ebp, edi
981ror edi, 5
982xor ebp, edi
983add edx, ebp
984mov ebx, ecx
985xor ecx, [rsp+((1024+7-(8+6)) MOD (8))*4]
986and eax, ecx
987xor eax, [rsp+((1024+7-(8+6)) MOD (8))*4]
988mov ebp, ebx
989ror ebx, 2
990add eax, edx
991add edx, [rsp+((1024+7-(8+4)) MOD (8))*4]
992mov [rsp+((1024+7-(8+4)) MOD (8))*4], edx
993ror ebp, 22
994xor ebp, ebx
995ror ebx, 11
996xor ebp, ebx
997add eax, ebp
998mov [rsp+((1024+7-(8)) MOD (8))*4], eax
999mov edi, [rsp+((1024+7-(9+2)) MOD (8))*4]
1000xor edi, [rsp+((1024+7-(9+1)) MOD (8))*4]
1001and edi, edx
1002xor edi, [rsp+((1024+7-(9+1)) MOD (8))*4]
1003mov ebp, edx
1004ror edx, 6
1005ror ebp, 25
1006add edi, [rsi+(9)*4]
1007add edi, [rsp+8*4+((1024+15-(9)) MOD (16))*4]
1008add edi, [rsp+((1024+7-(9)) MOD (8))*4]
1009xor ebp, edx
1010ror edx, 5
1011xor ebp, edx
1012add edi, ebp
1013mov ebx, eax
1014xor eax, [rsp+((1024+7-(9+6)) MOD (8))*4]
1015and ecx, eax
1016xor ecx, [rsp+((1024+7-(9+6)) MOD (8))*4]
1017mov ebp, ebx
1018ror ebx, 2
1019add ecx, edi
1020add edi, [rsp+((1024+7-(9+4)) MOD (8))*4]
1021mov [rsp+((1024+7-(9+4)) MOD (8))*4], edi
1022ror ebp, 22
1023xor ebp, ebx
1024ror ebx, 11
1025xor ebp, ebx
1026add ecx, ebp
1027mov [rsp+((1024+7-(9)) MOD (8))*4], ecx
1028mov edx, [rsp+((1024+7-(10+2)) MOD (8))*4]
1029xor edx, [rsp+((1024+7-(10+1)) MOD (8))*4]
1030and edx, edi
1031xor edx, [rsp+((1024+7-(10+1)) MOD (8))*4]
1032mov ebp, edi
1033ror edi, 6
1034ror ebp, 25
1035add edx, [rsi+(10)*4]
1036add edx, [rsp+8*4+((1024+15-(10)) MOD (16))*4]
1037add edx, [rsp+((1024+7-(10)) MOD (8))*4]
1038xor ebp, edi
1039ror edi, 5
1040xor ebp, edi
1041add edx, ebp
1042mov ebx, ecx
1043xor ecx, [rsp+((1024+7-(10+6)) MOD (8))*4]
1044and eax, ecx
1045xor eax, [rsp+((1024+7-(10+6)) MOD (8))*4]
1046mov ebp, ebx
1047ror ebx, 2
1048add eax, edx
1049add edx, [rsp+((1024+7-(10+4)) MOD (8))*4]
1050mov [rsp+((1024+7-(10+4)) MOD (8))*4], edx
1051ror ebp, 22
1052xor ebp, ebx
1053ror ebx, 11
1054xor ebp, ebx
1055add eax, ebp
1056mov [rsp+((1024+7-(10)) MOD (8))*4], eax
1057mov edi, [rsp+((1024+7-(11+2)) MOD (8))*4]
1058xor edi, [rsp+((1024+7-(11+1)) MOD (8))*4]
1059and edi, edx
1060xor edi, [rsp+((1024+7-(11+1)) MOD (8))*4]
1061mov ebp, edx
1062ror edx, 6
1063ror ebp, 25
1064add edi, [rsi+(11)*4]
1065add edi, [rsp+8*4+((1024+15-(11)) MOD (16))*4]
1066add edi, [rsp+((1024+7-(11)) MOD (8))*4]
1067xor ebp, edx
1068ror edx, 5
1069xor ebp, edx
1070add edi, ebp
1071mov ebx, eax
1072xor eax, [rsp+((1024+7-(11+6)) MOD (8))*4]
1073and ecx, eax
1074xor ecx, [rsp+((1024+7-(11+6)) MOD (8))*4]
1075mov ebp, ebx
1076ror ebx, 2
1077add ecx, edi
1078add edi, [rsp+((1024+7-(11+4)) MOD (8))*4]
1079mov [rsp+((1024+7-(11+4)) MOD (8))*4], edi
1080ror ebp, 22
1081xor ebp, ebx
1082ror ebx, 11
1083xor ebp, ebx
1084add ecx, ebp
1085mov [rsp+((1024+7-(11)) MOD (8))*4], ecx
1086mov edx, [rsp+((1024+7-(12+2)) MOD (8))*4]
1087xor edx, [rsp+((1024+7-(12+1)) MOD (8))*4]
1088and edx, edi
1089xor edx, [rsp+((1024+7-(12+1)) MOD (8))*4]
1090mov ebp, edi
1091ror edi, 6
1092ror ebp, 25
1093add edx, [rsi+(12)*4]
1094add edx, [rsp+8*4+((1024+15-(12)) MOD (16))*4]
1095add edx, [rsp+((1024+7-(12)) MOD (8))*4]
1096xor ebp, edi
1097ror edi, 5
1098xor ebp, edi
1099add edx, ebp
1100mov ebx, ecx
1101xor ecx, [rsp+((1024+7-(12+6)) MOD (8))*4]
1102and eax, ecx
1103xor eax, [rsp+((1024+7-(12+6)) MOD (8))*4]
1104mov ebp, ebx
1105ror ebx, 2
1106add eax, edx
1107add edx, [rsp+((1024+7-(12+4)) MOD (8))*4]
1108mov [rsp+((1024+7-(12+4)) MOD (8))*4], edx
1109ror ebp, 22
1110xor ebp, ebx
1111ror ebx, 11
1112xor ebp, ebx
1113add eax, ebp
1114mov [rsp+((1024+7-(12)) MOD (8))*4], eax
1115mov edi, [rsp+((1024+7-(13+2)) MOD (8))*4]
1116xor edi, [rsp+((1024+7-(13+1)) MOD (8))*4]
1117and edi, edx
1118xor edi, [rsp+((1024+7-(13+1)) MOD (8))*4]
1119mov ebp, edx
1120ror edx, 6
1121ror ebp, 25
1122add edi, [rsi+(13)*4]
1123add edi, [rsp+8*4+((1024+15-(13)) MOD (16))*4]
1124add edi, [rsp+((1024+7-(13)) MOD (8))*4]
1125xor ebp, edx
1126ror edx, 5
1127xor ebp, edx
1128add edi, ebp
1129mov ebx, eax
1130xor eax, [rsp+((1024+7-(13+6)) MOD (8))*4]
1131and ecx, eax
1132xor ecx, [rsp+((1024+7-(13+6)) MOD (8))*4]
1133mov ebp, ebx
1134ror ebx, 2
1135add ecx, edi
1136add edi, [rsp+((1024+7-(13+4)) MOD (8))*4]
1137mov [rsp+((1024+7-(13+4)) MOD (8))*4], edi
1138ror ebp, 22
1139xor ebp, ebx
1140ror ebx, 11
1141xor ebp, ebx
1142add ecx, ebp
1143mov [rsp+((1024+7-(13)) MOD (8))*4], ecx
1144mov edx, [rsp+((1024+7-(14+2)) MOD (8))*4]
1145xor edx, [rsp+((1024+7-(14+1)) MOD (8))*4]
1146and edx, edi
1147xor edx, [rsp+((1024+7-(14+1)) MOD (8))*4]
1148mov ebp, edi
1149ror edi, 6
1150ror ebp, 25
1151add edx, [rsi+(14)*4]
1152add edx, [rsp+8*4+((1024+15-(14)) MOD (16))*4]
1153add edx, [rsp+((1024+7-(14)) MOD (8))*4]
1154xor ebp, edi
1155ror edi, 5
1156xor ebp, edi
1157add edx, ebp
1158mov ebx, ecx
1159xor ecx, [rsp+((1024+7-(14+6)) MOD (8))*4]
1160and eax, ecx
1161xor eax, [rsp+((1024+7-(14+6)) MOD (8))*4]
1162mov ebp, ebx
1163ror ebx, 2
1164add eax, edx
1165add edx, [rsp+((1024+7-(14+4)) MOD (8))*4]
1166mov [rsp+((1024+7-(14+4)) MOD (8))*4], edx
1167ror ebp, 22
1168xor ebp, ebx
1169ror ebx, 11
1170xor ebp, ebx
1171add eax, ebp
1172mov [rsp+((1024+7-(14)) MOD (8))*4], eax
1173mov edi, [rsp+((1024+7-(15+2)) MOD (8))*4]
1174xor edi, [rsp+((1024+7-(15+1)) MOD (8))*4]
1175and edi, edx
1176xor edi, [rsp+((1024+7-(15+1)) MOD (8))*4]
1177mov ebp, edx
1178ror edx, 6
1179ror ebp, 25
1180add edi, [rsi+(15)*4]
1181add edi, [rsp+8*4+((1024+15-(15)) MOD (16))*4]
1182add edi, [rsp+((1024+7-(15)) MOD (8))*4]
1183xor ebp, edx
1184ror edx, 5
1185xor ebp, edx
1186add edi, ebp
1187mov ebx, eax
1188xor eax, [rsp+((1024+7-(15+6)) MOD (8))*4]
1189and ecx, eax
1190xor ecx, [rsp+((1024+7-(15+6)) MOD (8))*4]
1191mov ebp, ebx
1192ror ebx, 2
1193add ecx, edi
1194add edi, [rsp+((1024+7-(15+4)) MOD (8))*4]
1195mov [rsp+((1024+7-(15+4)) MOD (8))*4], edi
1196ror ebp, 22
1197xor ebp, ebx
1198ror ebx, 11
1199xor ebp, ebx
1200add ecx, ebp
1201mov [rsp+((1024+7-(15)) MOD (8))*4], ecx
1202label1:
1203add rsi, 4*16
1204mov edx, [rsp+((1024+7-(0+2)) MOD (8))*4]
1205xor edx, [rsp+((1024+7-(0+1)) MOD (8))*4]
1206and edx, edi
1207xor edx, [rsp+((1024+7-(0+1)) MOD (8))*4]
1208mov ebp, edi
1209ror edi, 6
1210ror ebp, 25
1211xor ebp, edi
1212ror edi, 5
1213xor ebp, edi
1214add edx, ebp
1215mov ebp, [rsp+8*4+((1024+15-((0)-2)) MOD (16))*4]
1216mov edi, [rsp+8*4+((1024+15-((0)-15)) MOD (16))*4]
1217mov ebx, ebp
1218shr ebp, 10
1219ror ebx, 17
1220xor ebp, ebx
1221ror ebx, 2
1222xor ebx, ebp
1223add ebx, [rsp+8*4+((1024+15-((0)-7)) MOD (16))*4]
1224mov ebp, edi
1225shr ebp, 3
1226ror edi, 7
1227add ebx, [rsp+8*4+((1024+15-(0)) MOD (16))*4]
1228xor ebp, edi
1229add edx, [rsi+(0)*4]
1230ror edi, 11
1231add edx, [rsp+((1024+7-(0)) MOD (8))*4]
1232xor ebp, edi
1233add ebp, ebx
1234mov [rsp+8*4+((1024+15-(0)) MOD (16))*4], ebp
1235add edx, ebp
1236mov ebx, ecx
1237xor ecx, [rsp+((1024+7-(0+6)) MOD (8))*4]
1238and eax, ecx
1239xor eax, [rsp+((1024+7-(0+6)) MOD (8))*4]
1240mov ebp, ebx
1241ror ebx, 2
1242add eax, edx
1243add edx, [rsp+((1024+7-(0+4)) MOD (8))*4]
1244mov [rsp+((1024+7-(0+4)) MOD (8))*4], edx
1245ror ebp, 22
1246xor ebp, ebx
1247ror ebx, 11
1248xor ebp, ebx
1249add eax, ebp
1250mov [rsp+((1024+7-(0)) MOD (8))*4], eax
1251mov edi, [rsp+((1024+7-(1+2)) MOD (8))*4]
1252xor edi, [rsp+((1024+7-(1+1)) MOD (8))*4]
1253and edi, edx
1254xor edi, [rsp+((1024+7-(1+1)) MOD (8))*4]
1255mov ebp, edx
1256ror edx, 6
1257ror ebp, 25
1258xor ebp, edx
1259ror edx, 5
1260xor ebp, edx
1261add edi, ebp
1262mov ebp, [rsp+8*4+((1024+15-((1)-2)) MOD (16))*4]
1263mov edx, [rsp+8*4+((1024+15-((1)-15)) MOD (16))*4]
1264mov ebx, ebp
1265shr ebp, 10
1266ror ebx, 17
1267xor ebp, ebx
1268ror ebx, 2
1269xor ebx, ebp
1270add ebx, [rsp+8*4+((1024+15-((1)-7)) MOD (16))*4]
1271mov ebp, edx
1272shr ebp, 3
1273ror edx, 7
1274add ebx, [rsp+8*4+((1024+15-(1)) MOD (16))*4]
1275xor ebp, edx
1276add edi, [rsi+(1)*4]
1277ror edx, 11
1278add edi, [rsp+((1024+7-(1)) MOD (8))*4]
1279xor ebp, edx
1280add ebp, ebx
1281mov [rsp+8*4+((1024+15-(1)) MOD (16))*4], ebp
1282add edi, ebp
1283mov ebx, eax
1284xor eax, [rsp+((1024+7-(1+6)) MOD (8))*4]
1285and ecx, eax
1286xor ecx, [rsp+((1024+7-(1+6)) MOD (8))*4]
1287mov ebp, ebx
1288ror ebx, 2
1289add ecx, edi
1290add edi, [rsp+((1024+7-(1+4)) MOD (8))*4]
1291mov [rsp+((1024+7-(1+4)) MOD (8))*4], edi
1292ror ebp, 22
1293xor ebp, ebx
1294ror ebx, 11
1295xor ebp, ebx
1296add ecx, ebp
1297mov [rsp+((1024+7-(1)) MOD (8))*4], ecx
1298mov edx, [rsp+((1024+7-(2+2)) MOD (8))*4]
1299xor edx, [rsp+((1024+7-(2+1)) MOD (8))*4]
1300and edx, edi
1301xor edx, [rsp+((1024+7-(2+1)) MOD (8))*4]
1302mov ebp, edi
1303ror edi, 6
1304ror ebp, 25
1305xor ebp, edi
1306ror edi, 5
1307xor ebp, edi
1308add edx, ebp
1309mov ebp, [rsp+8*4+((1024+15-((2)-2)) MOD (16))*4]
1310mov edi, [rsp+8*4+((1024+15-((2)-15)) MOD (16))*4]
1311mov ebx, ebp
1312shr ebp, 10
1313ror ebx, 17
1314xor ebp, ebx
1315ror ebx, 2
1316xor ebx, ebp
1317add ebx, [rsp+8*4+((1024+15-((2)-7)) MOD (16))*4]
1318mov ebp, edi
1319shr ebp, 3
1320ror edi, 7
1321add ebx, [rsp+8*4+((1024+15-(2)) MOD (16))*4]
1322xor ebp, edi
1323add edx, [rsi+(2)*4]
1324ror edi, 11
1325add edx, [rsp+((1024+7-(2)) MOD (8))*4]
1326xor ebp, edi
1327add ebp, ebx
1328mov [rsp+8*4+((1024+15-(2)) MOD (16))*4], ebp
1329add edx, ebp
1330mov ebx, ecx
1331xor ecx, [rsp+((1024+7-(2+6)) MOD (8))*4]
1332and eax, ecx
1333xor eax, [rsp+((1024+7-(2+6)) MOD (8))*4]
1334mov ebp, ebx
1335ror ebx, 2
1336add eax, edx
1337add edx, [rsp+((1024+7-(2+4)) MOD (8))*4]
1338mov [rsp+((1024+7-(2+4)) MOD (8))*4], edx
1339ror ebp, 22
1340xor ebp, ebx
1341ror ebx, 11
1342xor ebp, ebx
1343add eax, ebp
1344mov [rsp+((1024+7-(2)) MOD (8))*4], eax
1345mov edi, [rsp+((1024+7-(3+2)) MOD (8))*4]
1346xor edi, [rsp+((1024+7-(3+1)) MOD (8))*4]
1347and edi, edx
1348xor edi, [rsp+((1024+7-(3+1)) MOD (8))*4]
1349mov ebp, edx
1350ror edx, 6
1351ror ebp, 25
1352xor ebp, edx
1353ror edx, 5
1354xor ebp, edx
1355add edi, ebp
1356mov ebp, [rsp+8*4+((1024+15-((3)-2)) MOD (16))*4]
1357mov edx, [rsp+8*4+((1024+15-((3)-15)) MOD (16))*4]
1358mov ebx, ebp
1359shr ebp, 10
1360ror ebx, 17
1361xor ebp, ebx
1362ror ebx, 2
1363xor ebx, ebp
1364add ebx, [rsp+8*4+((1024+15-((3)-7)) MOD (16))*4]
1365mov ebp, edx
1366shr ebp, 3
1367ror edx, 7
1368add ebx, [rsp+8*4+((1024+15-(3)) MOD (16))*4]
1369xor ebp, edx
1370add edi, [rsi+(3)*4]
1371ror edx, 11
1372add edi, [rsp+((1024+7-(3)) MOD (8))*4]
1373xor ebp, edx
1374add ebp, ebx
1375mov [rsp+8*4+((1024+15-(3)) MOD (16))*4], ebp
1376add edi, ebp
1377mov ebx, eax
1378xor eax, [rsp+((1024+7-(3+6)) MOD (8))*4]
1379and ecx, eax
1380xor ecx, [rsp+((1024+7-(3+6)) MOD (8))*4]
1381mov ebp, ebx
1382ror ebx, 2
1383add ecx, edi
1384add edi, [rsp+((1024+7-(3+4)) MOD (8))*4]
1385mov [rsp+((1024+7-(3+4)) MOD (8))*4], edi
1386ror ebp, 22
1387xor ebp, ebx
1388ror ebx, 11
1389xor ebp, ebx
1390add ecx, ebp
1391mov [rsp+((1024+7-(3)) MOD (8))*4], ecx
1392mov edx, [rsp+((1024+7-(4+2)) MOD (8))*4]
1393xor edx, [rsp+((1024+7-(4+1)) MOD (8))*4]
1394and edx, edi
1395xor edx, [rsp+((1024+7-(4+1)) MOD (8))*4]
1396mov ebp, edi
1397ror edi, 6
1398ror ebp, 25
1399xor ebp, edi
1400ror edi, 5
1401xor ebp, edi
1402add edx, ebp
1403mov ebp, [rsp+8*4+((1024+15-((4)-2)) MOD (16))*4]
1404mov edi, [rsp+8*4+((1024+15-((4)-15)) MOD (16))*4]
1405mov ebx, ebp
1406shr ebp, 10
1407ror ebx, 17
1408xor ebp, ebx
1409ror ebx, 2
1410xor ebx, ebp
1411add ebx, [rsp+8*4+((1024+15-((4)-7)) MOD (16))*4]
1412mov ebp, edi
1413shr ebp, 3
1414ror edi, 7
1415add ebx, [rsp+8*4+((1024+15-(4)) MOD (16))*4]
1416xor ebp, edi
1417add edx, [rsi+(4)*4]
1418ror edi, 11
1419add edx, [rsp+((1024+7-(4)) MOD (8))*4]
1420xor ebp, edi
1421add ebp, ebx
1422mov [rsp+8*4+((1024+15-(4)) MOD (16))*4], ebp
1423add edx, ebp
1424mov ebx, ecx
1425xor ecx, [rsp+((1024+7-(4+6)) MOD (8))*4]
1426and eax, ecx
1427xor eax, [rsp+((1024+7-(4+6)) MOD (8))*4]
1428mov ebp, ebx
1429ror ebx, 2
1430add eax, edx
1431add edx, [rsp+((1024+7-(4+4)) MOD (8))*4]
1432mov [rsp+((1024+7-(4+4)) MOD (8))*4], edx
1433ror ebp, 22
1434xor ebp, ebx
1435ror ebx, 11
1436xor ebp, ebx
1437add eax, ebp
1438mov [rsp+((1024+7-(4)) MOD (8))*4], eax
1439mov edi, [rsp+((1024+7-(5+2)) MOD (8))*4]
1440xor edi, [rsp+((1024+7-(5+1)) MOD (8))*4]
1441and edi, edx
1442xor edi, [rsp+((1024+7-(5+1)) MOD (8))*4]
1443mov ebp, edx
1444ror edx, 6
1445ror ebp, 25
1446xor ebp, edx
1447ror edx, 5
1448xor ebp, edx
1449add edi, ebp
1450mov ebp, [rsp+8*4+((1024+15-((5)-2)) MOD (16))*4]
1451mov edx, [rsp+8*4+((1024+15-((5)-15)) MOD (16))*4]
1452mov ebx, ebp
1453shr ebp, 10
1454ror ebx, 17
1455xor ebp, ebx
1456ror ebx, 2
1457xor ebx, ebp
1458add ebx, [rsp+8*4+((1024+15-((5)-7)) MOD (16))*4]
1459mov ebp, edx
1460shr ebp, 3
1461ror edx, 7
1462add ebx, [rsp+8*4+((1024+15-(5)) MOD (16))*4]
1463xor ebp, edx
1464add edi, [rsi+(5)*4]
1465ror edx, 11
1466add edi, [rsp+((1024+7-(5)) MOD (8))*4]
1467xor ebp, edx
1468add ebp, ebx
1469mov [rsp+8*4+((1024+15-(5)) MOD (16))*4], ebp
1470add edi, ebp
1471mov ebx, eax
1472xor eax, [rsp+((1024+7-(5+6)) MOD (8))*4]
1473and ecx, eax
1474xor ecx, [rsp+((1024+7-(5+6)) MOD (8))*4]
1475mov ebp, ebx
1476ror ebx, 2
1477add ecx, edi
1478add edi, [rsp+((1024+7-(5+4)) MOD (8))*4]
1479mov [rsp+((1024+7-(5+4)) MOD (8))*4], edi
1480ror ebp, 22
1481xor ebp, ebx
1482ror ebx, 11
1483xor ebp, ebx
1484add ecx, ebp
1485mov [rsp+((1024+7-(5)) MOD (8))*4], ecx
1486mov edx, [rsp+((1024+7-(6+2)) MOD (8))*4]
1487xor edx, [rsp+((1024+7-(6+1)) MOD (8))*4]
1488and edx, edi
1489xor edx, [rsp+((1024+7-(6+1)) MOD (8))*4]
1490mov ebp, edi
1491ror edi, 6
1492ror ebp, 25
1493xor ebp, edi
1494ror edi, 5
1495xor ebp, edi
1496add edx, ebp
1497mov ebp, [rsp+8*4+((1024+15-((6)-2)) MOD (16))*4]
1498mov edi, [rsp+8*4+((1024+15-((6)-15)) MOD (16))*4]
1499mov ebx, ebp
1500shr ebp, 10
1501ror ebx, 17
1502xor ebp, ebx
1503ror ebx, 2
1504xor ebx, ebp
1505add ebx, [rsp+8*4+((1024+15-((6)-7)) MOD (16))*4]
1506mov ebp, edi
1507shr ebp, 3
1508ror edi, 7
1509add ebx, [rsp+8*4+((1024+15-(6)) MOD (16))*4]
1510xor ebp, edi
1511add edx, [rsi+(6)*4]
1512ror edi, 11
1513add edx, [rsp+((1024+7-(6)) MOD (8))*4]
1514xor ebp, edi
1515add ebp, ebx
1516mov [rsp+8*4+((1024+15-(6)) MOD (16))*4], ebp
1517add edx, ebp
1518mov ebx, ecx
1519xor ecx, [rsp+((1024+7-(6+6)) MOD (8))*4]
1520and eax, ecx
1521xor eax, [rsp+((1024+7-(6+6)) MOD (8))*4]
1522mov ebp, ebx
1523ror ebx, 2
1524add eax, edx
1525add edx, [rsp+((1024+7-(6+4)) MOD (8))*4]
1526mov [rsp+((1024+7-(6+4)) MOD (8))*4], edx
1527ror ebp, 22
1528xor ebp, ebx
1529ror ebx, 11
1530xor ebp, ebx
1531add eax, ebp
1532mov [rsp+((1024+7-(6)) MOD (8))*4], eax
1533mov edi, [rsp+((1024+7-(7+2)) MOD (8))*4]
1534xor edi, [rsp+((1024+7-(7+1)) MOD (8))*4]
1535and edi, edx
1536xor edi, [rsp+((1024+7-(7+1)) MOD (8))*4]
1537mov ebp, edx
1538ror edx, 6
1539ror ebp, 25
1540xor ebp, edx
1541ror edx, 5
1542xor ebp, edx
1543add edi, ebp
1544mov ebp, [rsp+8*4+((1024+15-((7)-2)) MOD (16))*4]
1545mov edx, [rsp+8*4+((1024+15-((7)-15)) MOD (16))*4]
1546mov ebx, ebp
1547shr ebp, 10
1548ror ebx, 17
1549xor ebp, ebx
1550ror ebx, 2
1551xor ebx, ebp
1552add ebx, [rsp+8*4+((1024+15-((7)-7)) MOD (16))*4]
1553mov ebp, edx
1554shr ebp, 3
1555ror edx, 7
1556add ebx, [rsp+8*4+((1024+15-(7)) MOD (16))*4]
1557xor ebp, edx
1558add edi, [rsi+(7)*4]
1559ror edx, 11
1560add edi, [rsp+((1024+7-(7)) MOD (8))*4]
1561xor ebp, edx
1562add ebp, ebx
1563mov [rsp+8*4+((1024+15-(7)) MOD (16))*4], ebp
1564add edi, ebp
1565mov ebx, eax
1566xor eax, [rsp+((1024+7-(7+6)) MOD (8))*4]
1567and ecx, eax
1568xor ecx, [rsp+((1024+7-(7+6)) MOD (8))*4]
1569mov ebp, ebx
1570ror ebx, 2
1571add ecx, edi
1572add edi, [rsp+((1024+7-(7+4)) MOD (8))*4]
1573mov [rsp+((1024+7-(7+4)) MOD (8))*4], edi
1574ror ebp, 22
1575xor ebp, ebx
1576ror ebx, 11
1577xor ebp, ebx
1578add ecx, ebp
1579mov [rsp+((1024+7-(7)) MOD (8))*4], ecx
1580mov edx, [rsp+((1024+7-(8+2)) MOD (8))*4]
1581xor edx, [rsp+((1024+7-(8+1)) MOD (8))*4]
1582and edx, edi
1583xor edx, [rsp+((1024+7-(8+1)) MOD (8))*4]
1584mov ebp, edi
1585ror edi, 6
1586ror ebp, 25
1587xor ebp, edi
1588ror edi, 5
1589xor ebp, edi
1590add edx, ebp
1591mov ebp, [rsp+8*4+((1024+15-((8)-2)) MOD (16))*4]
1592mov edi, [rsp+8*4+((1024+15-((8)-15)) MOD (16))*4]
1593mov ebx, ebp
1594shr ebp, 10
1595ror ebx, 17
1596xor ebp, ebx
1597ror ebx, 2
1598xor ebx, ebp
1599add ebx, [rsp+8*4+((1024+15-((8)-7)) MOD (16))*4]
1600mov ebp, edi
1601shr ebp, 3
1602ror edi, 7
1603add ebx, [rsp+8*4+((1024+15-(8)) MOD (16))*4]
1604xor ebp, edi
1605add edx, [rsi+(8)*4]
1606ror edi, 11
1607add edx, [rsp+((1024+7-(8)) MOD (8))*4]
1608xor ebp, edi
1609add ebp, ebx
1610mov [rsp+8*4+((1024+15-(8)) MOD (16))*4], ebp
1611add edx, ebp
1612mov ebx, ecx
1613xor ecx, [rsp+((1024+7-(8+6)) MOD (8))*4]
1614and eax, ecx
1615xor eax, [rsp+((1024+7-(8+6)) MOD (8))*4]
1616mov ebp, ebx
1617ror ebx, 2
1618add eax, edx
1619add edx, [rsp+((1024+7-(8+4)) MOD (8))*4]
1620mov [rsp+((1024+7-(8+4)) MOD (8))*4], edx
1621ror ebp, 22
1622xor ebp, ebx
1623ror ebx, 11
1624xor ebp, ebx
1625add eax, ebp
1626mov [rsp+((1024+7-(8)) MOD (8))*4], eax
1627mov edi, [rsp+((1024+7-(9+2)) MOD (8))*4]
1628xor edi, [rsp+((1024+7-(9+1)) MOD (8))*4]
1629and edi, edx
1630xor edi, [rsp+((1024+7-(9+1)) MOD (8))*4]
1631mov ebp, edx
1632ror edx, 6
1633ror ebp, 25
1634xor ebp, edx
1635ror edx, 5
1636xor ebp, edx
1637add edi, ebp
1638mov ebp, [rsp+8*4+((1024+15-((9)-2)) MOD (16))*4]
1639mov edx, [rsp+8*4+((1024+15-((9)-15)) MOD (16))*4]
1640mov ebx, ebp
1641shr ebp, 10
1642ror ebx, 17
1643xor ebp, ebx
1644ror ebx, 2
1645xor ebx, ebp
1646add ebx, [rsp+8*4+((1024+15-((9)-7)) MOD (16))*4]
1647mov ebp, edx
1648shr ebp, 3
1649ror edx, 7
1650add ebx, [rsp+8*4+((1024+15-(9)) MOD (16))*4]
1651xor ebp, edx
1652add edi, [rsi+(9)*4]
1653ror edx, 11
1654add edi, [rsp+((1024+7-(9)) MOD (8))*4]
1655xor ebp, edx
1656add ebp, ebx
1657mov [rsp+8*4+((1024+15-(9)) MOD (16))*4], ebp
1658add edi, ebp
1659mov ebx, eax
1660xor eax, [rsp+((1024+7-(9+6)) MOD (8))*4]
1661and ecx, eax
1662xor ecx, [rsp+((1024+7-(9+6)) MOD (8))*4]
1663mov ebp, ebx
1664ror ebx, 2
1665add ecx, edi
1666add edi, [rsp+((1024+7-(9+4)) MOD (8))*4]
1667mov [rsp+((1024+7-(9+4)) MOD (8))*4], edi
1668ror ebp, 22
1669xor ebp, ebx
1670ror ebx, 11
1671xor ebp, ebx
1672add ecx, ebp
1673mov [rsp+((1024+7-(9)) MOD (8))*4], ecx
1674mov edx, [rsp+((1024+7-(10+2)) MOD (8))*4]
1675xor edx, [rsp+((1024+7-(10+1)) MOD (8))*4]
1676and edx, edi
1677xor edx, [rsp+((1024+7-(10+1)) MOD (8))*4]
1678mov ebp, edi
1679ror edi, 6
1680ror ebp, 25
1681xor ebp, edi
1682ror edi, 5
1683xor ebp, edi
1684add edx, ebp
1685mov ebp, [rsp+8*4+((1024+15-((10)-2)) MOD (16))*4]
1686mov edi, [rsp+8*4+((1024+15-((10)-15)) MOD (16))*4]
1687mov ebx, ebp
1688shr ebp, 10
1689ror ebx, 17
1690xor ebp, ebx
1691ror ebx, 2
1692xor ebx, ebp
1693add ebx, [rsp+8*4+((1024+15-((10)-7)) MOD (16))*4]
1694mov ebp, edi
1695shr ebp, 3
1696ror edi, 7
1697add ebx, [rsp+8*4+((1024+15-(10)) MOD (16))*4]
1698xor ebp, edi
1699add edx, [rsi+(10)*4]
1700ror edi, 11
1701add edx, [rsp+((1024+7-(10)) MOD (8))*4]
1702xor ebp, edi
1703add ebp, ebx
1704mov [rsp+8*4+((1024+15-(10)) MOD (16))*4], ebp
1705add edx, ebp
1706mov ebx, ecx
1707xor ecx, [rsp+((1024+7-(10+6)) MOD (8))*4]
1708and eax, ecx
1709xor eax, [rsp+((1024+7-(10+6)) MOD (8))*4]
1710mov ebp, ebx
1711ror ebx, 2
1712add eax, edx
1713add edx, [rsp+((1024+7-(10+4)) MOD (8))*4]
1714mov [rsp+((1024+7-(10+4)) MOD (8))*4], edx
1715ror ebp, 22
1716xor ebp, ebx
1717ror ebx, 11
1718xor ebp, ebx
1719add eax, ebp
1720mov [rsp+((1024+7-(10)) MOD (8))*4], eax
1721mov edi, [rsp+((1024+7-(11+2)) MOD (8))*4]
1722xor edi, [rsp+((1024+7-(11+1)) MOD (8))*4]
1723and edi, edx
1724xor edi, [rsp+((1024+7-(11+1)) MOD (8))*4]
1725mov ebp, edx
1726ror edx, 6
1727ror ebp, 25
1728xor ebp, edx
1729ror edx, 5
1730xor ebp, edx
1731add edi, ebp
1732mov ebp, [rsp+8*4+((1024+15-((11)-2)) MOD (16))*4]
1733mov edx, [rsp+8*4+((1024+15-((11)-15)) MOD (16))*4]
1734mov ebx, ebp
1735shr ebp, 10
1736ror ebx, 17
1737xor ebp, ebx
1738ror ebx, 2
1739xor ebx, ebp
1740add ebx, [rsp+8*4+((1024+15-((11)-7)) MOD (16))*4]
1741mov ebp, edx
1742shr ebp, 3
1743ror edx, 7
1744add ebx, [rsp+8*4+((1024+15-(11)) MOD (16))*4]
1745xor ebp, edx
1746add edi, [rsi+(11)*4]
1747ror edx, 11
1748add edi, [rsp+((1024+7-(11)) MOD (8))*4]
1749xor ebp, edx
1750add ebp, ebx
1751mov [rsp+8*4+((1024+15-(11)) MOD (16))*4], ebp
1752add edi, ebp
1753mov ebx, eax
1754xor eax, [rsp+((1024+7-(11+6)) MOD (8))*4]
1755and ecx, eax
1756xor ecx, [rsp+((1024+7-(11+6)) MOD (8))*4]
1757mov ebp, ebx
1758ror ebx, 2
1759add ecx, edi
1760add edi, [rsp+((1024+7-(11+4)) MOD (8))*4]
1761mov [rsp+((1024+7-(11+4)) MOD (8))*4], edi
1762ror ebp, 22
1763xor ebp, ebx
1764ror ebx, 11
1765xor ebp, ebx
1766add ecx, ebp
1767mov [rsp+((1024+7-(11)) MOD (8))*4], ecx
1768mov edx, [rsp+((1024+7-(12+2)) MOD (8))*4]
1769xor edx, [rsp+((1024+7-(12+1)) MOD (8))*4]
1770and edx, edi
1771xor edx, [rsp+((1024+7-(12+1)) MOD (8))*4]
1772mov ebp, edi
1773ror edi, 6
1774ror ebp, 25
1775xor ebp, edi
1776ror edi, 5
1777xor ebp, edi
1778add edx, ebp
1779mov ebp, [rsp+8*4+((1024+15-((12)-2)) MOD (16))*4]
1780mov edi, [rsp+8*4+((1024+15-((12)-15)) MOD (16))*4]
1781mov ebx, ebp
1782shr ebp, 10
1783ror ebx, 17
1784xor ebp, ebx
1785ror ebx, 2
1786xor ebx, ebp
1787add ebx, [rsp+8*4+((1024+15-((12)-7)) MOD (16))*4]
1788mov ebp, edi
1789shr ebp, 3
1790ror edi, 7
1791add ebx, [rsp+8*4+((1024+15-(12)) MOD (16))*4]
1792xor ebp, edi
1793add edx, [rsi+(12)*4]
1794ror edi, 11
1795add edx, [rsp+((1024+7-(12)) MOD (8))*4]
1796xor ebp, edi
1797add ebp, ebx
1798mov [rsp+8*4+((1024+15-(12)) MOD (16))*4], ebp
1799add edx, ebp
1800mov ebx, ecx
1801xor ecx, [rsp+((1024+7-(12+6)) MOD (8))*4]
1802and eax, ecx
1803xor eax, [rsp+((1024+7-(12+6)) MOD (8))*4]
1804mov ebp, ebx
1805ror ebx, 2
1806add eax, edx
1807add edx, [rsp+((1024+7-(12+4)) MOD (8))*4]
1808mov [rsp+((1024+7-(12+4)) MOD (8))*4], edx
1809ror ebp, 22
1810xor ebp, ebx
1811ror ebx, 11
1812xor ebp, ebx
1813add eax, ebp
1814mov [rsp+((1024+7-(12)) MOD (8))*4], eax
1815mov edi, [rsp+((1024+7-(13+2)) MOD (8))*4]
1816xor edi, [rsp+((1024+7-(13+1)) MOD (8))*4]
1817and edi, edx
1818xor edi, [rsp+((1024+7-(13+1)) MOD (8))*4]
1819mov ebp, edx
1820ror edx, 6
1821ror ebp, 25
1822xor ebp, edx
1823ror edx, 5
1824xor ebp, edx
1825add edi, ebp
1826mov ebp, [rsp+8*4+((1024+15-((13)-2)) MOD (16))*4]
1827mov edx, [rsp+8*4+((1024+15-((13)-15)) MOD (16))*4]
1828mov ebx, ebp
1829shr ebp, 10
1830ror ebx, 17
1831xor ebp, ebx
1832ror ebx, 2
1833xor ebx, ebp
1834add ebx, [rsp+8*4+((1024+15-((13)-7)) MOD (16))*4]
1835mov ebp, edx
1836shr ebp, 3
1837ror edx, 7
1838add ebx, [rsp+8*4+((1024+15-(13)) MOD (16))*4]
1839xor ebp, edx
1840add edi, [rsi+(13)*4]
1841ror edx, 11
1842add edi, [rsp+((1024+7-(13)) MOD (8))*4]
1843xor ebp, edx
1844add ebp, ebx
1845mov [rsp+8*4+((1024+15-(13)) MOD (16))*4], ebp
1846add edi, ebp
1847mov ebx, eax
1848xor eax, [rsp+((1024+7-(13+6)) MOD (8))*4]
1849and ecx, eax
1850xor ecx, [rsp+((1024+7-(13+6)) MOD (8))*4]
1851mov ebp, ebx
1852ror ebx, 2
1853add ecx, edi
1854add edi, [rsp+((1024+7-(13+4)) MOD (8))*4]
1855mov [rsp+((1024+7-(13+4)) MOD (8))*4], edi
1856ror ebp, 22
1857xor ebp, ebx
1858ror ebx, 11
1859xor ebp, ebx
1860add ecx, ebp
1861mov [rsp+((1024+7-(13)) MOD (8))*4], ecx
1862mov edx, [rsp+((1024+7-(14+2)) MOD (8))*4]
1863xor edx, [rsp+((1024+7-(14+1)) MOD (8))*4]
1864and edx, edi
1865xor edx, [rsp+((1024+7-(14+1)) MOD (8))*4]
1866mov ebp, edi
1867ror edi, 6
1868ror ebp, 25
1869xor ebp, edi
1870ror edi, 5
1871xor ebp, edi
1872add edx, ebp
1873mov ebp, [rsp+8*4+((1024+15-((14)-2)) MOD (16))*4]
1874mov edi, [rsp+8*4+((1024+15-((14)-15)) MOD (16))*4]
1875mov ebx, ebp
1876shr ebp, 10
1877ror ebx, 17
1878xor ebp, ebx
1879ror ebx, 2
1880xor ebx, ebp
1881add ebx, [rsp+8*4+((1024+15-((14)-7)) MOD (16))*4]
1882mov ebp, edi
1883shr ebp, 3
1884ror edi, 7
1885add ebx, [rsp+8*4+((1024+15-(14)) MOD (16))*4]
1886xor ebp, edi
1887add edx, [rsi+(14)*4]
1888ror edi, 11
1889add edx, [rsp+((1024+7-(14)) MOD (8))*4]
1890xor ebp, edi
1891add ebp, ebx
1892mov [rsp+8*4+((1024+15-(14)) MOD (16))*4], ebp
1893add edx, ebp
1894mov ebx, ecx
1895xor ecx, [rsp+((1024+7-(14+6)) MOD (8))*4]
1896and eax, ecx
1897xor eax, [rsp+((1024+7-(14+6)) MOD (8))*4]
1898mov ebp, ebx
1899ror ebx, 2
1900add eax, edx
1901add edx, [rsp+((1024+7-(14+4)) MOD (8))*4]
1902mov [rsp+((1024+7-(14+4)) MOD (8))*4], edx
1903ror ebp, 22
1904xor ebp, ebx
1905ror ebx, 11
1906xor ebp, ebx
1907add eax, ebp
1908mov [rsp+((1024+7-(14)) MOD (8))*4], eax
1909mov edi, [rsp+((1024+7-(15+2)) MOD (8))*4]
1910xor edi, [rsp+((1024+7-(15+1)) MOD (8))*4]
1911and edi, edx
1912xor edi, [rsp+((1024+7-(15+1)) MOD (8))*4]
1913mov ebp, edx
1914ror edx, 6
1915ror ebp, 25
1916xor ebp, edx
1917ror edx, 5
1918xor ebp, edx
1919add edi, ebp
1920mov ebp, [rsp+8*4+((1024+15-((15)-2)) MOD (16))*4]
1921mov edx, [rsp+8*4+((1024+15-((15)-15)) MOD (16))*4]
1922mov ebx, ebp
1923shr ebp, 10
1924ror ebx, 17
1925xor ebp, ebx
1926ror ebx, 2
1927xor ebx, ebp
1928add ebx, [rsp+8*4+((1024+15-((15)-7)) MOD (16))*4]
1929mov ebp, edx
1930shr ebp, 3
1931ror edx, 7
1932add ebx, [rsp+8*4+((1024+15-(15)) MOD (16))*4]
1933xor ebp, edx
1934add edi, [rsi+(15)*4]
1935ror edx, 11
1936add edi, [rsp+((1024+7-(15)) MOD (8))*4]
1937xor ebp, edx
1938add ebp, ebx
1939mov [rsp+8*4+((1024+15-(15)) MOD (16))*4], ebp
1940add edi, ebp
1941mov ebx, eax
1942xor eax, [rsp+((1024+7-(15+6)) MOD (8))*4]
1943and ecx, eax
1944xor ecx, [rsp+((1024+7-(15+6)) MOD (8))*4]
1945mov ebp, ebx
1946ror ebx, 2
1947add ecx, edi
1948add edi, [rsp+((1024+7-(15+4)) MOD (8))*4]
1949mov [rsp+((1024+7-(15+4)) MOD (8))*4], edi
1950ror ebp, 22
1951xor ebp, ebx
1952ror ebx, 11
1953xor ebp, ebx
1954add ecx, ebp
1955mov [rsp+((1024+7-(15)) MOD (8))*4], ecx
1956cmp rsi, [rsp+8*4+16*4+0*8]
1957jne label1
1958mov rcx, [rsp+8*4+16*4+1*8]
1959movdqa xmm1, XMMWORD PTR [rcx+1*16]
1960movdqa xmm0, XMMWORD PTR [rcx+0*16]
1961paddd xmm1, [rsp+((1024+7-(0+3)) MOD (8))*4]
1962paddd xmm0, [rsp+((1024+7-(0+7)) MOD (8))*4]
1963movdqa [rcx+1*16], xmm1
1964movdqa [rcx+0*16], xmm0
1965mov rdx, [rsp+8*4+16*4+2*8]
1966add rdx, 64
1967mov [rsp+8*4+16*4+2*8], rdx
1968cmp rdx, [rsp+8*4+16*4+3*8]
1969jne label0
1970add		rsp, 8*4 + 16*4 + 4*8 + 8
1971pop		rbp
1972pop		rbx
1973pop		rdi
1974pop		rsi
1975ret
1976SHA256_HashMultipleBlocks_SSE2 ENDP
1977
1978;; http://www.agner.org/optimize/vectorclass/read.php?i=65
1979;; word64 Xgetbv(word32 ctrl)
1980;; ctrl = rcx
1981
1982    ALIGN   8
1983XGETBV64	PROC
1984    ;; query
1985    DB  	0fh, 01h, 0d0h
1986    ;; xcr = (EDX << 32) | EAX
1987    and 	rax, 0ffffffffh
1988    shl 	rdx, 32
1989    or  	rax, rdx
1990    ret
1991XGETBV64	ENDP
1992
1993;; word64 CpuId(word32 func, word32 subfunc, word32 output[4])
1994;; func = rcx
1995;; subfunc = rdx
1996;; output = r8
1997
1998    ALIGN   8
1999CPUID64	PROC
2000    ;; preserve per ABI
2001    push	rbx
2002    ;; eax = func
2003    mov 	rax, rcx
2004    ;; ecx = subfunc
2005    mov 	rcx, rdx
2006    ;; query
2007    cpuid
2008    ;; save
2009    mov 	[r8+0],  eax
2010    mov 	[r8+4],  ebx
2011    mov 	[r8+8],  ecx
2012    mov 	[r8+12], edx
2013    ;; restore
2014    pop 	rbx
2015    ;; return
2016    mov 	rax, 1
2017    ret
2018CPUID64	ENDP
2019
2020_TEXT ENDS
2021END
2022