1//+build !noasm,!appengine
2
3// SHA256 implementation for AVX2
4
5//
6// Minio Cloud Storage, (C) 2016 Minio, Inc.
7//
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11//
12//     http://www.apache.org/licenses/LICENSE-2.0
13//
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
20
21//
22// This code is based on an Intel White-Paper:
23// "Fast SHA-256 Implementations on Intel Architecture Processors"
24//
25// together with the reference implementation from the following authors:
26//    James Guilford <james.guilford@intel.com>
27//    Kirk Yap <kirk.s.yap@intel.com>
28//    Tim Chen <tim.c.chen@linux.intel.com>
29//
30// For Golang it has been converted to Plan 9 assembly with the help of
31// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
32// equivalents
33//
34
// K256<> is a file-local, read-only constant table used by blockAvx2.
//
// Layout (byte offsets from the start of the symbol):
//
//   0x000-0x1ff  The 64 SHA-256 round constants, four 32-bit words per
//                16-byte group. Every 16-byte group is stored twice in a
//                row, so each 32-byte (YMM-sized) row holds the same four
//                constants in both of its 128-bit halves. The code adds
//                whole rows to the message words with vpaddd, which holds
//                one input block per 128-bit half.
//   0x200-0x21f  vpshufb mask that reverses the byte order inside every
//                32-bit word; loaded into YMM10 and used for the byte swap
//                of the input data.
//   0x220-0x23f  vpshufb mask loaded into YMM8: per 128-bit half, copies
//                source dwords 0 and 2 into destination dwords 0 and 1 and
//                zeroes the upper 8 bytes (0xff selector bytes clear the
//                destination byte).
//   0x240-0x25f  vpshufb mask loaded into YMM9: per 128-bit half, zeroes
//                the lower 8 bytes and copies source dwords 0 and 2 into
//                destination dwords 2 and 3.
//
// Each DATA directive stores one little-endian 64-bit value, i.e. two
// consecutive 32-bit constants with the lower-addressed one in the low half.
35DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
36DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
37DATA K256<>+0x010(SB)/8, $0x71374491428a2f98
38DATA K256<>+0x018(SB)/8, $0xe9b5dba5b5c0fbcf
39DATA K256<>+0x020(SB)/8, $0x59f111f13956c25b
40DATA K256<>+0x028(SB)/8, $0xab1c5ed5923f82a4
41DATA K256<>+0x030(SB)/8, $0x59f111f13956c25b
42DATA K256<>+0x038(SB)/8, $0xab1c5ed5923f82a4
43DATA K256<>+0x040(SB)/8, $0x12835b01d807aa98
44DATA K256<>+0x048(SB)/8, $0x550c7dc3243185be
45DATA K256<>+0x050(SB)/8, $0x12835b01d807aa98
46DATA K256<>+0x058(SB)/8, $0x550c7dc3243185be
47DATA K256<>+0x060(SB)/8, $0x80deb1fe72be5d74
48DATA K256<>+0x068(SB)/8, $0xc19bf1749bdc06a7
49DATA K256<>+0x070(SB)/8, $0x80deb1fe72be5d74
50DATA K256<>+0x078(SB)/8, $0xc19bf1749bdc06a7
51DATA K256<>+0x080(SB)/8, $0xefbe4786e49b69c1
52DATA K256<>+0x088(SB)/8, $0x240ca1cc0fc19dc6
53DATA K256<>+0x090(SB)/8, $0xefbe4786e49b69c1
54DATA K256<>+0x098(SB)/8, $0x240ca1cc0fc19dc6
55DATA K256<>+0x0a0(SB)/8, $0x4a7484aa2de92c6f
56DATA K256<>+0x0a8(SB)/8, $0x76f988da5cb0a9dc
57DATA K256<>+0x0b0(SB)/8, $0x4a7484aa2de92c6f
58DATA K256<>+0x0b8(SB)/8, $0x76f988da5cb0a9dc
59DATA K256<>+0x0c0(SB)/8, $0xa831c66d983e5152
60DATA K256<>+0x0c8(SB)/8, $0xbf597fc7b00327c8
61DATA K256<>+0x0d0(SB)/8, $0xa831c66d983e5152
62DATA K256<>+0x0d8(SB)/8, $0xbf597fc7b00327c8
63DATA K256<>+0x0e0(SB)/8, $0xd5a79147c6e00bf3
64DATA K256<>+0x0e8(SB)/8, $0x1429296706ca6351
65DATA K256<>+0x0f0(SB)/8, $0xd5a79147c6e00bf3
66DATA K256<>+0x0f8(SB)/8, $0x1429296706ca6351
67DATA K256<>+0x100(SB)/8, $0x2e1b213827b70a85
68DATA K256<>+0x108(SB)/8, $0x53380d134d2c6dfc
69DATA K256<>+0x110(SB)/8, $0x2e1b213827b70a85
70DATA K256<>+0x118(SB)/8, $0x53380d134d2c6dfc
71DATA K256<>+0x120(SB)/8, $0x766a0abb650a7354
72DATA K256<>+0x128(SB)/8, $0x92722c8581c2c92e
73DATA K256<>+0x130(SB)/8, $0x766a0abb650a7354
74DATA K256<>+0x138(SB)/8, $0x92722c8581c2c92e
75DATA K256<>+0x140(SB)/8, $0xa81a664ba2bfe8a1
76DATA K256<>+0x148(SB)/8, $0xc76c51a3c24b8b70
77DATA K256<>+0x150(SB)/8, $0xa81a664ba2bfe8a1
78DATA K256<>+0x158(SB)/8, $0xc76c51a3c24b8b70
79DATA K256<>+0x160(SB)/8, $0xd6990624d192e819
80DATA K256<>+0x168(SB)/8, $0x106aa070f40e3585
81DATA K256<>+0x170(SB)/8, $0xd6990624d192e819
82DATA K256<>+0x178(SB)/8, $0x106aa070f40e3585
83DATA K256<>+0x180(SB)/8, $0x1e376c0819a4c116
84DATA K256<>+0x188(SB)/8, $0x34b0bcb52748774c
85DATA K256<>+0x190(SB)/8, $0x1e376c0819a4c116
86DATA K256<>+0x198(SB)/8, $0x34b0bcb52748774c
87DATA K256<>+0x1a0(SB)/8, $0x4ed8aa4a391c0cb3
88DATA K256<>+0x1a8(SB)/8, $0x682e6ff35b9cca4f
89DATA K256<>+0x1b0(SB)/8, $0x4ed8aa4a391c0cb3
90DATA K256<>+0x1b8(SB)/8, $0x682e6ff35b9cca4f
91DATA K256<>+0x1c0(SB)/8, $0x78a5636f748f82ee
92DATA K256<>+0x1c8(SB)/8, $0x8cc7020884c87814
93DATA K256<>+0x1d0(SB)/8, $0x78a5636f748f82ee
94DATA K256<>+0x1d8(SB)/8, $0x8cc7020884c87814
95DATA K256<>+0x1e0(SB)/8, $0xa4506ceb90befffa
96DATA K256<>+0x1e8(SB)/8, $0xc67178f2bef9a3f7
97DATA K256<>+0x1f0(SB)/8, $0xa4506ceb90befffa
98DATA K256<>+0x1f8(SB)/8, $0xc67178f2bef9a3f7
99
// 0x200: byte-order reversal mask for each 32-bit word (loaded into YMM10).
100DATA K256<>+0x200(SB)/8, $0x0405060700010203
101DATA K256<>+0x208(SB)/8, $0x0c0d0e0f08090a0b
102DATA K256<>+0x210(SB)/8, $0x0405060700010203
103DATA K256<>+0x218(SB)/8, $0x0c0d0e0f08090a0b
// 0x220: keep dwords 0 and 2 in the low qword, zero the high qword (YMM8).
104DATA K256<>+0x220(SB)/8, $0x0b0a090803020100
105DATA K256<>+0x228(SB)/8, $0xffffffffffffffff
106DATA K256<>+0x230(SB)/8, $0x0b0a090803020100
107DATA K256<>+0x238(SB)/8, $0xffffffffffffffff
// 0x240: zero the low qword, place dwords 0 and 2 in the high qword (YMM9).
108DATA K256<>+0x240(SB)/8, $0xffffffffffffffff
109DATA K256<>+0x248(SB)/8, $0x0b0a090803020100
110DATA K256<>+0x250(SB)/8, $0xffffffffffffffff
111DATA K256<>+0x258(SB)/8, $0x0b0a090803020100
112
// Total size is 608 (=0x260) bytes: 0x200 bytes of round constants plus
// three 0x20-byte shuffle masks. Flag value 8 is RODATA in Go's textflag.h.
113GLOBL K256<>(SB), 8, $608
114
115// We need 0x220 bytes of stack space aligned on a 512-byte boundary, so for
116// the worst-case alignment of SP we need twice this amount, i.e. 1088 (=0x440)
117//
118// SP        aligned   end-aligned  stacksize
119// 100013d0  10001400  10001620     592
120// 100013d8  10001400  10001620     584
121// 100013e0  10001600  10001820     1088
122// 100013e8  10001600  10001820     1080
123
124// func blockAvx2(h []uint32, message []uint8)
125TEXT ·blockAvx2(SB),$1088-48
126
127	MOVQ h+0(FP), DI             // DI: &h
128	MOVQ message_base+24(FP), SI // SI: &message
129	MOVQ message_len+32(FP), DX  // len(message)
130	ADDQ SI, DX                  // end pointer of input
131	MOVQ SP, R11                 // copy stack pointer
132	ADDQ $0x220, SP              // sp += 0x220
133	ANDQ $0xfffffffffffffe00, SP // align stack frame
134	ADDQ $0x1c0, SP
135	MOVQ DI, 0x40(SP)            // save ctx
136	MOVQ SI, 0x48(SP)            // save input
137	MOVQ DX, 0x50(SP)            // save end pointer
138	MOVQ R11, 0x58(SP)           // save copy of stack pointer
139
140	WORD $0xf8c5; BYTE $0x77 // vzeroupper
141	ADDQ $0x40, SI           // input++
142	MOVL (DI), AX
143	MOVQ SI, R12             // borrow $T1
144	MOVL 4(DI), BX
145	CMPQ SI, DX              // $_end
146	MOVL 8(DI), CX
147	LONG $0xe4440f4c         // cmove  r12,rsp            /* next block or random data */
148	MOVL 12(DI), DX
149	MOVL 16(DI), R8
150	MOVL 20(DI), R9
151	MOVL 24(DI), R10
152	MOVL 28(DI), R11
153
154	LEAQ K256<>(SB), BP
155	LONG $0x856f7dc5; LONG $0x00000220 // VMOVDQA YMM8, 0x220[rbp]  /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */
156	LONG $0x8d6f7dc5; LONG $0x00000240 // VMOVDQA YMM9, 0x240[rbp]  /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */
157	LONG $0x956f7dc5; LONG $0x00000200 // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm7,YMMWORD PTR [rip+0x200] */
158
159loop0:
160	LONG $0x6f7dc1c4; BYTE $0xfa // VMOVDQA YMM7, YMM10
161
162	// Load first 16 dwords from two blocks
163	MOVOU -64(SI), X0 // vmovdqu xmm0,XMMWORD PTR [rsi-0x40]
164	MOVOU -48(SI), X1 // vmovdqu xmm1,XMMWORD PTR [rsi-0x30]
165	MOVOU -32(SI), X2 // vmovdqu xmm2,XMMWORD PTR [rsi-0x20]
166	MOVOU -16(SI), X3 // vmovdqu xmm3,XMMWORD PTR [rsi-0x10]
167
168	// Byte swap data and transpose data into high/low
169	LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1
170	LONG $0x3875c3c4; LONG $0x0110244c         // vinserti128 ymm1,ymm1,0x10[r12],0x1
171	LONG $0x007de2c4; BYTE $0xc7               // vpshufb     ymm0,ymm0,ymm7
172	LONG $0x386dc3c4; LONG $0x01202454         // vinserti128 ymm2,ymm2,0x20[r12],0x1
173	LONG $0x0075e2c4; BYTE $0xcf               // vpshufb     ymm1,ymm1,ymm7
174	LONG $0x3865c3c4; LONG $0x0130245c         // vinserti128 ymm3,ymm3,0x30[r12],0x1
175
176	LEAQ K256<>(SB), BP
177	LONG $0x006de2c4; BYTE $0xd7 // vpshufb ymm2,ymm2,ymm7
178	LONG $0x65fefdc5; BYTE $0x00 // vpaddd  ymm4,ymm0,[rbp]
179	LONG $0x0065e2c4; BYTE $0xdf // vpshufb ymm3,ymm3,ymm7
180	LONG $0x6dfef5c5; BYTE $0x20 // vpaddd  ymm5,ymm1,0x20[rbp]
181	LONG $0x75feedc5; BYTE $0x40 // vpaddd  ymm6,ymm2,0x40[rbp]
182	LONG $0x7dfee5c5; BYTE $0x60 // vpaddd  ymm7,ymm3,0x60[rbp]
183
184	LONG $0x247ffdc5; BYTE $0x24   // vmovdqa [rsp],ymm4
185	XORQ R14, R14
186	LONG $0x6c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm5
187
188	ADDQ $-0x40, SP
189	MOVQ BX, DI
190	LONG $0x347ffdc5; BYTE $0x24   // vmovdqa [rsp],ymm6
191	XORQ CX, DI                    // magic
192	LONG $0x7c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm7
193	MOVQ R9, R12
194	ADDQ $0x80, BP
195
196loop1:
197	// Schedule 48 input dwords, by doing 3 rounds of 12 each
198	// Note: SIMD instructions are interleaved with the SHA calculations
199	ADDQ $-0x40, SP
200	LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4
201
202	// ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
203	LONG $0x249c0344; LONG $0x00000080 // add    r11d,[rsp+0x80]
204	WORD $0x2145; BYTE $0xc4           // and    r12d,r8d
205	LONG $0xf07b43c4; WORD $0x19e8     // rorx   r13d,r8d,0x19
206	LONG $0x0f65e3c4; WORD $0x04fa     // vpalignr ymm7,ymm3,ymm2,0x4
207	LONG $0xf07b43c4; WORD $0x0bf8     // rorx   r15d,r8d,0xb
208	LONG $0x30048d42                   // lea    eax,[rax+r14*1]
209	LONG $0x231c8d47                   // lea    r11d,[r11+r12*1]
210	LONG $0xd472cdc5; BYTE $0x07       // vpsrld ymm6,ymm4,0x7
211	LONG $0xf23842c4; BYTE $0xe2       // andn   r12d,r8d,r10d
212	WORD $0x3145; BYTE $0xfd           // xor    r13d,r15d
213	LONG $0xf07b43c4; WORD $0x06f0     // rorx   r14d,r8d,0x6
214	LONG $0xc7fefdc5                   // vpaddd ymm0,ymm0,ymm7
215	LONG $0x231c8d47                   // lea    r11d,[r11+r12*1]
216	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
217	WORD $0x8941; BYTE $0xc7           // mov    r15d,eax
218	LONG $0xd472c5c5; BYTE $0x03       // vpsrld ymm7,ymm4,0x3
219	LONG $0xf07b63c4; WORD $0x16e0     // rorx   r12d,eax,0x16
220	LONG $0x2b1c8d47                   // lea    r11d,[r11+r13*1]
221	WORD $0x3141; BYTE $0xdf           // xor    r15d,ebx
222	LONG $0xf472d5c5; BYTE $0x0e       // vpslld ymm5,ymm4,0xe
223	LONG $0xf07b63c4; WORD $0x0df0     // rorx   r14d,eax,0xd
224	LONG $0xf07b63c4; WORD $0x02e8     // rorx   r13d,eax,0x2
225	LONG $0x1a148d42                   // lea    edx,[rdx+r11*1]
226	LONG $0xe6efc5c5                   // vpxor  ymm4,ymm7,ymm6
227	WORD $0x2144; BYTE $0xff           // and    edi,r15d
228	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
229	WORD $0xdf31                       // xor    edi,ebx
230	LONG $0xfb70fdc5; BYTE $0xfa       // vpshufd ymm7,ymm3,0xfa
231	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
232	LONG $0x3b1c8d45                   // lea    r11d,[r11+rdi*1]
233	WORD $0x8945; BYTE $0xc4           // mov    r12d,r8d
234	LONG $0xd672cdc5; BYTE $0x0b       // vpsrld ymm6,ymm6,0xb
235
236	// ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
237	LONG $0x24940344; LONG $0x00000084 // add    r10d,[rsp+0x84]
238	WORD $0x2141; BYTE $0xd4           // and    r12d,edx
239	LONG $0xf07b63c4; WORD $0x19ea     // rorx   r13d,edx,0x19
240	LONG $0xe5efddc5                   // vpxor  ymm4,ymm4,ymm5
241	LONG $0xf07be3c4; WORD $0x0bfa     // rorx   edi,edx,0xb
242	LONG $0x331c8d47                   // lea    r11d,[r11+r14*1]
243	LONG $0x22148d47                   // lea    r10d,[r10+r12*1]
244	LONG $0xf572d5c5; BYTE $0x0b       // vpslld ymm5,ymm5,0xb
245	LONG $0xf26842c4; BYTE $0xe1       // andn   r12d,edx,r9d
246	WORD $0x3141; BYTE $0xfd           // xor    r13d,edi
247	LONG $0xf07b63c4; WORD $0x06f2     // rorx   r14d,edx,0x6
248	LONG $0xe6efddc5                   // vpxor  ymm4,ymm4,ymm6
249	LONG $0x22148d47                   // lea    r10d,[r10+r12*1]
250	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
251	WORD $0x8944; BYTE $0xdf           // mov    edi,r11d
252	LONG $0xd772cdc5; BYTE $0x0a       // vpsrld ymm6,ymm7,0xa
253	LONG $0xf07b43c4; WORD $0x16e3     // rorx   r12d,r11d,0x16
254	LONG $0x2a148d47                   // lea    r10d,[r10+r13*1]
255	WORD $0xc731                       // xor    edi,eax
256	LONG $0xe5efddc5                   // vpxor  ymm4,ymm4,ymm5
257	LONG $0xf07b43c4; WORD $0x0df3     // rorx   r14d,r11d,0xd
258	LONG $0xf07b43c4; WORD $0x02eb     // rorx   r13d,r11d,0x2
259	LONG $0x110c8d42                   // lea    ecx,[rcx+r10*1]
260	LONG $0xd773c5c5; BYTE $0x11       // vpsrlq ymm7,ymm7,0x11
261	WORD $0x2141; BYTE $0xff           // and    r15d,edi
262	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
263	WORD $0x3141; BYTE $0xc7           // xor    r15d,eax
264	LONG $0xc4fefdc5                   // vpaddd ymm0,ymm0,ymm4
265	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
266	LONG $0x3a148d47                   // lea    r10d,[r10+r15*1]
267	WORD $0x8941; BYTE $0xd4           // mov    r12d,edx
268	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
269
270	// ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
271	LONG $0x248c0344; LONG $0x00000088 // add    r9d,[rsp+0x88]
272	WORD $0x2141; BYTE $0xcc           // and    r12d,ecx
273	LONG $0xf07b63c4; WORD $0x19e9     // rorx   r13d,ecx,0x19
274	LONG $0xd773c5c5; BYTE $0x02       // vpsrlq ymm7,ymm7,0x2
275	LONG $0xf07b63c4; WORD $0x0bf9     // rorx   r15d,ecx,0xb
276	LONG $0x32148d47                   // lea    r10d,[r10+r14*1]
277	LONG $0x210c8d47                   // lea    r9d,[r9+r12*1]
278	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
279	LONG $0xf27042c4; BYTE $0xe0       // andn   r12d,ecx,r8d
280	WORD $0x3145; BYTE $0xfd           // xor    r13d,r15d
281	LONG $0xf07b63c4; WORD $0x06f1     // rorx   r14d,ecx,0x6
282	LONG $0x004dc2c4; BYTE $0xf0       // vpshufb ymm6,ymm6,ymm8
283	LONG $0x210c8d47                   // lea    r9d,[r9+r12*1]
284	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
285	WORD $0x8945; BYTE $0xd7           // mov    r15d,r10d
286	LONG $0xc6fefdc5                   // vpaddd ymm0,ymm0,ymm6
287	LONG $0xf07b43c4; WORD $0x16e2     // rorx   r12d,r10d,0x16
288	LONG $0x290c8d47                   // lea    r9d,[r9+r13*1]
289	WORD $0x3145; BYTE $0xdf           // xor    r15d,r11d
290	LONG $0xf870fdc5; BYTE $0x50       // vpshufd ymm7,ymm0,0x50
291	LONG $0xf07b43c4; WORD $0x0df2     // rorx   r14d,r10d,0xd
292	LONG $0xf07b43c4; WORD $0x02ea     // rorx   r13d,r10d,0x2
293	LONG $0x0b1c8d42                   // lea    ebx,[rbx+r9*1]
294	LONG $0xd772cdc5; BYTE $0x0a       // vpsrld ymm6,ymm7,0xa
295	WORD $0x2144; BYTE $0xff           // and    edi,r15d
296	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
297	WORD $0x3144; BYTE $0xdf           // xor    edi,r11d
298	LONG $0xd773c5c5; BYTE $0x11       // vpsrlq ymm7,ymm7,0x11
299	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
300	LONG $0x390c8d45                   // lea    r9d,[r9+rdi*1]
301	WORD $0x8941; BYTE $0xcc           // mov    r12d,ecx
302	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
303
304	// ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
305	LONG $0x24840344; LONG $0x0000008c // add    r8d,[rsp+0x8c]
306	WORD $0x2141; BYTE $0xdc           // and    r12d,ebx
307	LONG $0xf07b63c4; WORD $0x19eb     // rorx   r13d,ebx,0x19
308	LONG $0xd773c5c5; BYTE $0x02       // vpsrlq ymm7,ymm7,0x2
309	LONG $0xf07be3c4; WORD $0x0bfb     // rorx   edi,ebx,0xb
310	LONG $0x310c8d47                   // lea    r9d,[r9+r14*1]
311	LONG $0x20048d47                   // lea    r8d,[r8+r12*1]
312	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
313	LONG $0xf26062c4; BYTE $0xe2       // andn   r12d,ebx,edx
314	WORD $0x3141; BYTE $0xfd           // xor    r13d,edi
315	LONG $0xf07b63c4; WORD $0x06f3     // rorx   r14d,ebx,0x6
316	LONG $0x004dc2c4; BYTE $0xf1       // vpshufb ymm6,ymm6,ymm9
317	LONG $0x20048d47                   // lea    r8d,[r8+r12*1]
318	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
319	WORD $0x8944; BYTE $0xcf           // mov    edi,r9d
320	LONG $0xc6fefdc5                   // vpaddd ymm0,ymm0,ymm6
321	LONG $0xf07b43c4; WORD $0x16e1     // rorx   r12d,r9d,0x16
322	LONG $0x28048d47                   // lea    r8d,[r8+r13*1]
323	WORD $0x3144; BYTE $0xd7           // xor    edi,r10d
324	LONG $0x75fefdc5; BYTE $0x00       // vpaddd ymm6,ymm0,[rbp+0x0]
325	LONG $0xf07b43c4; WORD $0x0df1     // rorx   r14d,r9d,0xd
326	LONG $0xf07b43c4; WORD $0x02e9     // rorx   r13d,r9d,0x2
327	LONG $0x00048d42                   // lea    eax,[rax+r8*1]
328	WORD $0x2141; BYTE $0xff           // and    r15d,edi
329	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
330	WORD $0x3145; BYTE $0xd7           // xor    r15d,r10d
331	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
332	LONG $0x38048d47                   // lea    r8d,[r8+r15*1]
333	WORD $0x8941; BYTE $0xdc           // mov    r12d,ebx
334
335	LONG $0x347ffdc5; BYTE $0x24   // vmovdqa [rsp],ymm6
336	LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4
337
338	// ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
339	LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add    edx,[rsp+0xa0]
340	WORD $0x2141; BYTE $0xc4                   // and    r12d,eax
341	LONG $0xf07b63c4; WORD $0x19e8             // rorx   r13d,eax,0x19
342	LONG $0x0f7de3c4; WORD $0x04fb             // vpalignr ymm7,ymm0,ymm3,0x4
343	LONG $0xf07b63c4; WORD $0x0bf8             // rorx   r15d,eax,0xb
344	LONG $0x30048d47                           // lea    r8d,[r8+r14*1]
345	LONG $0x22148d42                           // lea    edx,[rdx+r12*1]
346	LONG $0xd472cdc5; BYTE $0x07               // vpsrld ymm6,ymm4,0x7
347	LONG $0xf27862c4; BYTE $0xe1               // andn   r12d,eax,ecx
348	WORD $0x3145; BYTE $0xfd                   // xor    r13d,r15d
349	LONG $0xf07b63c4; WORD $0x06f0             // rorx   r14d,eax,0x6
350	LONG $0xcffef5c5                           // vpaddd ymm1,ymm1,ymm7
351	LONG $0x22148d42                           // lea    edx,[rdx+r12*1]
352	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
353	WORD $0x8945; BYTE $0xc7                   // mov    r15d,r8d
354	LONG $0xd472c5c5; BYTE $0x03               // vpsrld ymm7,ymm4,0x3
355	LONG $0xf07b43c4; WORD $0x16e0             // rorx   r12d,r8d,0x16
356	LONG $0x2a148d42                           // lea    edx,[rdx+r13*1]
357	WORD $0x3145; BYTE $0xcf                   // xor    r15d,r9d
358	LONG $0xf472d5c5; BYTE $0x0e               // vpslld ymm5,ymm4,0xe
359	LONG $0xf07b43c4; WORD $0x0df0             // rorx   r14d,r8d,0xd
360	LONG $0xf07b43c4; WORD $0x02e8             // rorx   r13d,r8d,0x2
361	LONG $0x131c8d45                           // lea    r11d,[r11+rdx*1]
362	LONG $0xe6efc5c5                           // vpxor  ymm4,ymm7,ymm6
363	WORD $0x2144; BYTE $0xff                   // and    edi,r15d
364	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
365	WORD $0x3144; BYTE $0xcf                   // xor    edi,r9d
366	LONG $0xf870fdc5; BYTE $0xfa               // vpshufd ymm7,ymm0,0xfa
367	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
368	WORD $0x148d; BYTE $0x3a                   // lea    edx,[rdx+rdi*1]
369	WORD $0x8941; BYTE $0xc4                   // mov    r12d,eax
370	LONG $0xd672cdc5; BYTE $0x0b               // vpsrld ymm6,ymm6,0xb
371
372	// ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
373	LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add    ecx,[rsp+0xa4]
374	WORD $0x2145; BYTE $0xdc                   // and    r12d,r11d
375	LONG $0xf07b43c4; WORD $0x19eb             // rorx   r13d,r11d,0x19
376	LONG $0xe5efddc5                           // vpxor  ymm4,ymm4,ymm5
377	LONG $0xf07bc3c4; WORD $0x0bfb             // rorx   edi,r11d,0xb
378	LONG $0x32148d42                           // lea    edx,[rdx+r14*1]
379	LONG $0x210c8d42                           // lea    ecx,[rcx+r12*1]
380	LONG $0xf572d5c5; BYTE $0x0b               // vpslld ymm5,ymm5,0xb
381	LONG $0xf22062c4; BYTE $0xe3               // andn   r12d,r11d,ebx
382	WORD $0x3141; BYTE $0xfd                   // xor    r13d,edi
383	LONG $0xf07b43c4; WORD $0x06f3             // rorx   r14d,r11d,0x6
384	LONG $0xe6efddc5                           // vpxor  ymm4,ymm4,ymm6
385	LONG $0x210c8d42                           // lea    ecx,[rcx+r12*1]
386	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
387	WORD $0xd789                               // mov    edi,edx
388	LONG $0xd772cdc5; BYTE $0x0a               // vpsrld ymm6,ymm7,0xa
389	LONG $0xf07b63c4; WORD $0x16e2             // rorx   r12d,edx,0x16
390	LONG $0x290c8d42                           // lea    ecx,[rcx+r13*1]
391	WORD $0x3144; BYTE $0xc7                   // xor    edi,r8d
392	LONG $0xe5efddc5                           // vpxor  ymm4,ymm4,ymm5
393	LONG $0xf07b63c4; WORD $0x0df2             // rorx   r14d,edx,0xd
394	LONG $0xf07b63c4; WORD $0x02ea             // rorx   r13d,edx,0x2
395	LONG $0x0a148d45                           // lea    r10d,[r10+rcx*1]
396	LONG $0xd773c5c5; BYTE $0x11               // vpsrlq ymm7,ymm7,0x11
397	WORD $0x2141; BYTE $0xff                   // and    r15d,edi
398	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
399	WORD $0x3145; BYTE $0xc7                   // xor    r15d,r8d
400	LONG $0xccfef5c5                           // vpaddd ymm1,ymm1,ymm4
401	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
402	LONG $0x390c8d42                           // lea    ecx,[rcx+r15*1]
403	WORD $0x8945; BYTE $0xdc                   // mov    r12d,r11d
404	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
405
406	// ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
407	LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add    ebx,[rsp+0xa8]
408	WORD $0x2145; BYTE $0xd4                   // and    r12d,r10d
409	LONG $0xf07b43c4; WORD $0x19ea             // rorx   r13d,r10d,0x19
410	LONG $0xd773c5c5; BYTE $0x02               // vpsrlq ymm7,ymm7,0x2
411	LONG $0xf07b43c4; WORD $0x0bfa             // rorx   r15d,r10d,0xb
412	LONG $0x310c8d42                           // lea    ecx,[rcx+r14*1]
413	LONG $0x231c8d42                           // lea    ebx,[rbx+r12*1]
414	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
415	LONG $0xf22862c4; BYTE $0xe0               // andn   r12d,r10d,eax
416	WORD $0x3145; BYTE $0xfd                   // xor    r13d,r15d
417	LONG $0xf07b43c4; WORD $0x06f2             // rorx   r14d,r10d,0x6
418	LONG $0x004dc2c4; BYTE $0xf0               // vpshufb ymm6,ymm6,ymm8
419	LONG $0x231c8d42                           // lea    ebx,[rbx+r12*1]
420	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
421	WORD $0x8941; BYTE $0xcf                   // mov    r15d,ecx
422	LONG $0xcefef5c5                           // vpaddd ymm1,ymm1,ymm6
423	LONG $0xf07b63c4; WORD $0x16e1             // rorx   r12d,ecx,0x16
424	LONG $0x2b1c8d42                           // lea    ebx,[rbx+r13*1]
425	WORD $0x3141; BYTE $0xd7                   // xor    r15d,edx
426	LONG $0xf970fdc5; BYTE $0x50               // vpshufd ymm7,ymm1,0x50
427	LONG $0xf07b63c4; WORD $0x0df1             // rorx   r14d,ecx,0xd
428	LONG $0xf07b63c4; WORD $0x02e9             // rorx   r13d,ecx,0x2
429	LONG $0x190c8d45                           // lea    r9d,[r9+rbx*1]
430	LONG $0xd772cdc5; BYTE $0x0a               // vpsrld ymm6,ymm7,0xa
431	WORD $0x2144; BYTE $0xff                   // and    edi,r15d
432	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
433	WORD $0xd731                               // xor    edi,edx
434	LONG $0xd773c5c5; BYTE $0x11               // vpsrlq ymm7,ymm7,0x11
435	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
436	WORD $0x1c8d; BYTE $0x3b                   // lea    ebx,[rbx+rdi*1]
437	WORD $0x8945; BYTE $0xd4                   // mov    r12d,r10d
438	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
439
440	// ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
441	LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add    eax,[rsp+0xac]
442	WORD $0x2145; BYTE $0xcc                   // and    r12d,r9d
443	LONG $0xf07b43c4; WORD $0x19e9             // rorx   r13d,r9d,0x19
444	LONG $0xd773c5c5; BYTE $0x02               // vpsrlq ymm7,ymm7,0x2
445	LONG $0xf07bc3c4; WORD $0x0bf9             // rorx   edi,r9d,0xb
446	LONG $0x331c8d42                           // lea    ebx,[rbx+r14*1]
447	LONG $0x20048d42                           // lea    eax,[rax+r12*1]
448	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
449	LONG $0xf23042c4; BYTE $0xe3               // andn   r12d,r9d,r11d
450	WORD $0x3141; BYTE $0xfd                   // xor    r13d,edi
451	LONG $0xf07b43c4; WORD $0x06f1             // rorx   r14d,r9d,0x6
452	LONG $0x004dc2c4; BYTE $0xf1               // vpshufb ymm6,ymm6,ymm9
453	LONG $0x20048d42                           // lea    eax,[rax+r12*1]
454	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
455	WORD $0xdf89                               // mov    edi,ebx
456	LONG $0xcefef5c5                           // vpaddd ymm1,ymm1,ymm6
457	LONG $0xf07b63c4; WORD $0x16e3             // rorx   r12d,ebx,0x16
458	LONG $0x28048d42                           // lea    eax,[rax+r13*1]
459	WORD $0xcf31                               // xor    edi,ecx
460	LONG $0x75fef5c5; BYTE $0x20               // vpaddd ymm6,ymm1,[rbp+0x20]
461	LONG $0xf07b63c4; WORD $0x0df3             // rorx   r14d,ebx,0xd
462	LONG $0xf07b63c4; WORD $0x02eb             // rorx   r13d,ebx,0x2
463	LONG $0x00048d45                           // lea    r8d,[r8+rax*1]
464	WORD $0x2141; BYTE $0xff                   // and    r15d,edi
465	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
466	WORD $0x3141; BYTE $0xcf                   // xor    r15d,ecx
467	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
468	LONG $0x38048d42                           // lea    eax,[rax+r15*1]
469	WORD $0x8945; BYTE $0xcc                   // mov    r12d,r9d
470
471	LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
472
473	LONG $0x24648d48; BYTE $0xc0   // lea    rsp,[rsp-0x40]
474	LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4
475
476	// ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
477	LONG $0x249c0344; LONG $0x00000080 // add    r11d,[rsp+0x80]
478	WORD $0x2145; BYTE $0xc4           // and    r12d,r8d
479	LONG $0xf07b43c4; WORD $0x19e8     // rorx   r13d,r8d,0x19
480	LONG $0x0f75e3c4; WORD $0x04f8     // vpalignr ymm7,ymm1,ymm0,0x4
481	LONG $0xf07b43c4; WORD $0x0bf8     // rorx   r15d,r8d,0xb
482	LONG $0x30048d42                   // lea    eax,[rax+r14*1]
483	LONG $0x231c8d47                   // lea    r11d,[r11+r12*1]
484	LONG $0xd472cdc5; BYTE $0x07       // vpsrld ymm6,ymm4,0x7
485	LONG $0xf23842c4; BYTE $0xe2       // andn   r12d,r8d,r10d
486	WORD $0x3145; BYTE $0xfd           // xor    r13d,r15d
487	LONG $0xf07b43c4; WORD $0x06f0     // rorx   r14d,r8d,0x6
488	LONG $0xd7feedc5                   // vpaddd ymm2,ymm2,ymm7
489	LONG $0x231c8d47                   // lea    r11d,[r11+r12*1]
490	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
491	WORD $0x8941; BYTE $0xc7           // mov    r15d,eax
492	LONG $0xd472c5c5; BYTE $0x03       // vpsrld ymm7,ymm4,0x3
493	LONG $0xf07b63c4; WORD $0x16e0     // rorx   r12d,eax,0x16
494	LONG $0x2b1c8d47                   // lea    r11d,[r11+r13*1]
495	WORD $0x3141; BYTE $0xdf           // xor    r15d,ebx
496	LONG $0xf472d5c5; BYTE $0x0e       // vpslld ymm5,ymm4,0xe
497	LONG $0xf07b63c4; WORD $0x0df0     // rorx   r14d,eax,0xd
498	LONG $0xf07b63c4; WORD $0x02e8     // rorx   r13d,eax,0x2
499	LONG $0x1a148d42                   // lea    edx,[rdx+r11*1]
500	LONG $0xe6efc5c5                   // vpxor  ymm4,ymm7,ymm6
501	WORD $0x2144; BYTE $0xff           // and    edi,r15d
502	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
503	WORD $0xdf31                       // xor    edi,ebx
504	LONG $0xf970fdc5; BYTE $0xfa       // vpshufd ymm7,ymm1,0xfa
505	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
506	LONG $0x3b1c8d45                   // lea    r11d,[r11+rdi*1]
507	WORD $0x8945; BYTE $0xc4           // mov    r12d,r8d
508	LONG $0xd672cdc5; BYTE $0x0b       // vpsrld ymm6,ymm6,0xb
509
510	// ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
511	LONG $0x24940344; LONG $0x00000084 // add    r10d,[rsp+0x84]
512	WORD $0x2141; BYTE $0xd4           // and    r12d,edx
513	LONG $0xf07b63c4; WORD $0x19ea     // rorx   r13d,edx,0x19
514	LONG $0xe5efddc5                   // vpxor  ymm4,ymm4,ymm5
515	LONG $0xf07be3c4; WORD $0x0bfa     // rorx   edi,edx,0xb
516	LONG $0x331c8d47                   // lea    r11d,[r11+r14*1]
517	LONG $0x22148d47                   // lea    r10d,[r10+r12*1]
518	LONG $0xf572d5c5; BYTE $0x0b       // vpslld ymm5,ymm5,0xb
519	LONG $0xf26842c4; BYTE $0xe1       // andn   r12d,edx,r9d
520	WORD $0x3141; BYTE $0xfd           // xor    r13d,edi
521	LONG $0xf07b63c4; WORD $0x06f2     // rorx   r14d,edx,0x6
522	LONG $0xe6efddc5                   // vpxor  ymm4,ymm4,ymm6
523	LONG $0x22148d47                   // lea    r10d,[r10+r12*1]
524	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
525	WORD $0x8944; BYTE $0xdf           // mov    edi,r11d
526	LONG $0xd772cdc5; BYTE $0x0a       // vpsrld ymm6,ymm7,0xa
527	LONG $0xf07b43c4; WORD $0x16e3     // rorx   r12d,r11d,0x16
528	LONG $0x2a148d47                   // lea    r10d,[r10+r13*1]
529	WORD $0xc731                       // xor    edi,eax
530	LONG $0xe5efddc5                   // vpxor  ymm4,ymm4,ymm5
531	LONG $0xf07b43c4; WORD $0x0df3     // rorx   r14d,r11d,0xd
532	LONG $0xf07b43c4; WORD $0x02eb     // rorx   r13d,r11d,0x2
533	LONG $0x110c8d42                   // lea    ecx,[rcx+r10*1]
534	LONG $0xd773c5c5; BYTE $0x11       // vpsrlq ymm7,ymm7,0x11
535	WORD $0x2141; BYTE $0xff           // and    r15d,edi
536	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
537	WORD $0x3141; BYTE $0xc7           // xor    r15d,eax
538	LONG $0xd4feedc5                   // vpaddd ymm2,ymm2,ymm4
539	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
540	LONG $0x3a148d47                   // lea    r10d,[r10+r15*1]
541	WORD $0x8941; BYTE $0xd4           // mov    r12d,edx
542	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
543
544	// ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
545	LONG $0x248c0344; LONG $0x00000088 // add    r9d,[rsp+0x88]
546	WORD $0x2141; BYTE $0xcc           // and    r12d,ecx
547	LONG $0xf07b63c4; WORD $0x19e9     // rorx   r13d,ecx,0x19
548	LONG $0xd773c5c5; BYTE $0x02       // vpsrlq ymm7,ymm7,0x2
549	LONG $0xf07b63c4; WORD $0x0bf9     // rorx   r15d,ecx,0xb
550	LONG $0x32148d47                   // lea    r10d,[r10+r14*1]
551	LONG $0x210c8d47                   // lea    r9d,[r9+r12*1]
552	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
553	LONG $0xf27042c4; BYTE $0xe0       // andn   r12d,ecx,r8d
554	WORD $0x3145; BYTE $0xfd           // xor    r13d,r15d
555	LONG $0xf07b63c4; WORD $0x06f1     // rorx   r14d,ecx,0x6
556	LONG $0x004dc2c4; BYTE $0xf0       // vpshufb ymm6,ymm6,ymm8
557	LONG $0x210c8d47                   // lea    r9d,[r9+r12*1]
558	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
559	WORD $0x8945; BYTE $0xd7           // mov    r15d,r10d
560	LONG $0xd6feedc5                   // vpaddd ymm2,ymm2,ymm6
561	LONG $0xf07b43c4; WORD $0x16e2     // rorx   r12d,r10d,0x16
562	LONG $0x290c8d47                   // lea    r9d,[r9+r13*1]
563	WORD $0x3145; BYTE $0xdf           // xor    r15d,r11d
564	LONG $0xfa70fdc5; BYTE $0x50       // vpshufd ymm7,ymm2,0x50
565	LONG $0xf07b43c4; WORD $0x0df2     // rorx   r14d,r10d,0xd
566	LONG $0xf07b43c4; WORD $0x02ea     // rorx   r13d,r10d,0x2
567	LONG $0x0b1c8d42                   // lea    ebx,[rbx+r9*1]
568	LONG $0xd772cdc5; BYTE $0x0a       // vpsrld ymm6,ymm7,0xa
569	WORD $0x2144; BYTE $0xff           // and    edi,r15d
570	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
571	WORD $0x3144; BYTE $0xdf           // xor    edi,r11d
572	LONG $0xd773c5c5; BYTE $0x11       // vpsrlq ymm7,ymm7,0x11
573	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
574	LONG $0x390c8d45                   // lea    r9d,[r9+rdi*1]
575	WORD $0x8941; BYTE $0xcc           // mov    r12d,ecx
576	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
577
578	// ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
579	LONG $0x24840344; LONG $0x0000008c // add    r8d,[rsp+0x8c]
580	WORD $0x2141; BYTE $0xdc           // and    r12d,ebx
581	LONG $0xf07b63c4; WORD $0x19eb     // rorx   r13d,ebx,0x19
582	LONG $0xd773c5c5; BYTE $0x02       // vpsrlq ymm7,ymm7,0x2
583	LONG $0xf07be3c4; WORD $0x0bfb     // rorx   edi,ebx,0xb
584	LONG $0x310c8d47                   // lea    r9d,[r9+r14*1]
585	LONG $0x20048d47                   // lea    r8d,[r8+r12*1]
586	LONG $0xf7efcdc5                   // vpxor  ymm6,ymm6,ymm7
587	LONG $0xf26062c4; BYTE $0xe2       // andn   r12d,ebx,edx
588	WORD $0x3141; BYTE $0xfd           // xor    r13d,edi
589	LONG $0xf07b63c4; WORD $0x06f3     // rorx   r14d,ebx,0x6
590	LONG $0x004dc2c4; BYTE $0xf1       // vpshufb ymm6,ymm6,ymm9
591	LONG $0x20048d47                   // lea    r8d,[r8+r12*1]
592	WORD $0x3145; BYTE $0xf5           // xor    r13d,r14d
593	WORD $0x8944; BYTE $0xcf           // mov    edi,r9d
594	LONG $0xd6feedc5                   // vpaddd ymm2,ymm2,ymm6
595	LONG $0xf07b43c4; WORD $0x16e1     // rorx   r12d,r9d,0x16
596	LONG $0x28048d47                   // lea    r8d,[r8+r13*1]
597	WORD $0x3144; BYTE $0xd7           // xor    edi,r10d
598	LONG $0x75feedc5; BYTE $0x40       // vpaddd ymm6,ymm2,[rbp+0x40]
599	LONG $0xf07b43c4; WORD $0x0df1     // rorx   r14d,r9d,0xd
600	LONG $0xf07b43c4; WORD $0x02e9     // rorx   r13d,r9d,0x2
601	LONG $0x00048d42                   // lea    eax,[rax+r8*1]
602	WORD $0x2141; BYTE $0xff           // and    r15d,edi
603	WORD $0x3145; BYTE $0xe6           // xor    r14d,r12d
604	WORD $0x3145; BYTE $0xd7           // xor    r15d,r10d
605	WORD $0x3145; BYTE $0xee           // xor    r14d,r13d
606	LONG $0x38048d47                   // lea    r8d,[r8+r15*1]
607	WORD $0x8941; BYTE $0xdc           // mov    r12d,ebx
608
609	LONG $0x347ffdc5; BYTE $0x24   // vmovdqa [rsp],ymm6
610	LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4
611
612	// ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
613	LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add    edx,[rsp+0xa0]
614	WORD $0x2141; BYTE $0xc4                   // and    r12d,eax
615	LONG $0xf07b63c4; WORD $0x19e8             // rorx   r13d,eax,0x19
616	LONG $0x0f6de3c4; WORD $0x04f9             // vpalignr ymm7,ymm2,ymm1,0x4
617	LONG $0xf07b63c4; WORD $0x0bf8             // rorx   r15d,eax,0xb
618	LONG $0x30048d47                           // lea    r8d,[r8+r14*1]
619	LONG $0x22148d42                           // lea    edx,[rdx+r12*1]
620	LONG $0xd472cdc5; BYTE $0x07               // vpsrld ymm6,ymm4,0x7
621	LONG $0xf27862c4; BYTE $0xe1               // andn   r12d,eax,ecx
622	WORD $0x3145; BYTE $0xfd                   // xor    r13d,r15d
623	LONG $0xf07b63c4; WORD $0x06f0             // rorx   r14d,eax,0x6
624	LONG $0xdffee5c5                           // vpaddd ymm3,ymm3,ymm7
625	LONG $0x22148d42                           // lea    edx,[rdx+r12*1]
626	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
627	WORD $0x8945; BYTE $0xc7                   // mov    r15d,r8d
628	LONG $0xd472c5c5; BYTE $0x03               // vpsrld ymm7,ymm4,0x3
629	LONG $0xf07b43c4; WORD $0x16e0             // rorx   r12d,r8d,0x16
630	LONG $0x2a148d42                           // lea    edx,[rdx+r13*1]
631	WORD $0x3145; BYTE $0xcf                   // xor    r15d,r9d
632	LONG $0xf472d5c5; BYTE $0x0e               // vpslld ymm5,ymm4,0xe
633	LONG $0xf07b43c4; WORD $0x0df0             // rorx   r14d,r8d,0xd
634	LONG $0xf07b43c4; WORD $0x02e8             // rorx   r13d,r8d,0x2
635	LONG $0x131c8d45                           // lea    r11d,[r11+rdx*1]
636	LONG $0xe6efc5c5                           // vpxor  ymm4,ymm7,ymm6
637	WORD $0x2144; BYTE $0xff                   // and    edi,r15d
638	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
639	WORD $0x3144; BYTE $0xcf                   // xor    edi,r9d
640	LONG $0xfa70fdc5; BYTE $0xfa               // vpshufd ymm7,ymm2,0xfa
641	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
642	WORD $0x148d; BYTE $0x3a                   // lea    edx,[rdx+rdi*1]
643	WORD $0x8941; BYTE $0xc4                   // mov    r12d,eax
644	LONG $0xd672cdc5; BYTE $0x0b               // vpsrld ymm6,ymm6,0xb
645
646	// ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
647	LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add    ecx,[rsp+0xa4]
648	WORD $0x2145; BYTE $0xdc                   // and    r12d,r11d
649	LONG $0xf07b43c4; WORD $0x19eb             // rorx   r13d,r11d,0x19
650	LONG $0xe5efddc5                           // vpxor  ymm4,ymm4,ymm5
651	LONG $0xf07bc3c4; WORD $0x0bfb             // rorx   edi,r11d,0xb
652	LONG $0x32148d42                           // lea    edx,[rdx+r14*1]
653	LONG $0x210c8d42                           // lea    ecx,[rcx+r12*1]
654	LONG $0xf572d5c5; BYTE $0x0b               // vpslld ymm5,ymm5,0xb
655	LONG $0xf22062c4; BYTE $0xe3               // andn   r12d,r11d,ebx
656	WORD $0x3141; BYTE $0xfd                   // xor    r13d,edi
657	LONG $0xf07b43c4; WORD $0x06f3             // rorx   r14d,r11d,0x6
658	LONG $0xe6efddc5                           // vpxor  ymm4,ymm4,ymm6
659	LONG $0x210c8d42                           // lea    ecx,[rcx+r12*1]
660	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
661	WORD $0xd789                               // mov    edi,edx
662	LONG $0xd772cdc5; BYTE $0x0a               // vpsrld ymm6,ymm7,0xa
663	LONG $0xf07b63c4; WORD $0x16e2             // rorx   r12d,edx,0x16
664	LONG $0x290c8d42                           // lea    ecx,[rcx+r13*1]
665	WORD $0x3144; BYTE $0xc7                   // xor    edi,r8d
666	LONG $0xe5efddc5                           // vpxor  ymm4,ymm4,ymm5
667	LONG $0xf07b63c4; WORD $0x0df2             // rorx   r14d,edx,0xd
668	LONG $0xf07b63c4; WORD $0x02ea             // rorx   r13d,edx,0x2
669	LONG $0x0a148d45                           // lea    r10d,[r10+rcx*1]
670	LONG $0xd773c5c5; BYTE $0x11               // vpsrlq ymm7,ymm7,0x11
671	WORD $0x2141; BYTE $0xff                   // and    r15d,edi
672	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
673	WORD $0x3145; BYTE $0xc7                   // xor    r15d,r8d
674	LONG $0xdcfee5c5                           // vpaddd ymm3,ymm3,ymm4
675	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
676	LONG $0x390c8d42                           // lea    ecx,[rcx+r15*1]
677	WORD $0x8945; BYTE $0xdc                   // mov    r12d,r11d
678	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
679
680	// ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
681	LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add    ebx,[rsp+0xa8]
682	WORD $0x2145; BYTE $0xd4                   // and    r12d,r10d
683	LONG $0xf07b43c4; WORD $0x19ea             // rorx   r13d,r10d,0x19
684	LONG $0xd773c5c5; BYTE $0x02               // vpsrlq ymm7,ymm7,0x2
685	LONG $0xf07b43c4; WORD $0x0bfa             // rorx   r15d,r10d,0xb
686	LONG $0x310c8d42                           // lea    ecx,[rcx+r14*1]
687	LONG $0x231c8d42                           // lea    ebx,[rbx+r12*1]
688	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
689	LONG $0xf22862c4; BYTE $0xe0               // andn   r12d,r10d,eax
690	WORD $0x3145; BYTE $0xfd                   // xor    r13d,r15d
691	LONG $0xf07b43c4; WORD $0x06f2             // rorx   r14d,r10d,0x6
692	LONG $0x004dc2c4; BYTE $0xf0               // vpshufb ymm6,ymm6,ymm8
693	LONG $0x231c8d42                           // lea    ebx,[rbx+r12*1]
694	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
695	WORD $0x8941; BYTE $0xcf                   // mov    r15d,ecx
696	LONG $0xdefee5c5                           // vpaddd ymm3,ymm3,ymm6
697	LONG $0xf07b63c4; WORD $0x16e1             // rorx   r12d,ecx,0x16
698	LONG $0x2b1c8d42                           // lea    ebx,[rbx+r13*1]
699	WORD $0x3141; BYTE $0xd7                   // xor    r15d,edx
700	LONG $0xfb70fdc5; BYTE $0x50               // vpshufd ymm7,ymm3,0x50
701	LONG $0xf07b63c4; WORD $0x0df1             // rorx   r14d,ecx,0xd
702	LONG $0xf07b63c4; WORD $0x02e9             // rorx   r13d,ecx,0x2
703	LONG $0x190c8d45                           // lea    r9d,[r9+rbx*1]
704	LONG $0xd772cdc5; BYTE $0x0a               // vpsrld ymm6,ymm7,0xa
705	WORD $0x2144; BYTE $0xff                   // and    edi,r15d
706	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
707	WORD $0xd731                               // xor    edi,edx
708	LONG $0xd773c5c5; BYTE $0x11               // vpsrlq ymm7,ymm7,0x11
709	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
710	WORD $0x1c8d; BYTE $0x3b                   // lea    ebx,[rbx+rdi*1]
711	WORD $0x8945; BYTE $0xd4                   // mov    r12d,r10d
712	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
713
714	// ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
715	LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add    eax,[rsp+0xac]
716	WORD $0x2145; BYTE $0xcc                   // and    r12d,r9d
717	LONG $0xf07b43c4; WORD $0x19e9             // rorx   r13d,r9d,0x19
718	LONG $0xd773c5c5; BYTE $0x02               // vpsrlq ymm7,ymm7,0x2
719	LONG $0xf07bc3c4; WORD $0x0bf9             // rorx   edi,r9d,0xb
720	LONG $0x331c8d42                           // lea    ebx,[rbx+r14*1]
721	LONG $0x20048d42                           // lea    eax,[rax+r12*1]
722	LONG $0xf7efcdc5                           // vpxor  ymm6,ymm6,ymm7
723	LONG $0xf23042c4; BYTE $0xe3               // andn   r12d,r9d,r11d
724	WORD $0x3141; BYTE $0xfd                   // xor    r13d,edi
725	LONG $0xf07b43c4; WORD $0x06f1             // rorx   r14d,r9d,0x6
726	LONG $0x004dc2c4; BYTE $0xf1               // vpshufb ymm6,ymm6,ymm9
727	LONG $0x20048d42                           // lea    eax,[rax+r12*1]
728	WORD $0x3145; BYTE $0xf5                   // xor    r13d,r14d
729	WORD $0xdf89                               // mov    edi,ebx
730	LONG $0xdefee5c5                           // vpaddd ymm3,ymm3,ymm6
731	LONG $0xf07b63c4; WORD $0x16e3             // rorx   r12d,ebx,0x16
732	LONG $0x28048d42                           // lea    eax,[rax+r13*1]
733	WORD $0xcf31                               // xor    edi,ecx
734	LONG $0x75fee5c5; BYTE $0x60               // vpaddd ymm6,ymm3,[rbp+0x60]
735	LONG $0xf07b63c4; WORD $0x0df3             // rorx   r14d,ebx,0xd
736	LONG $0xf07b63c4; WORD $0x02eb             // rorx   r13d,ebx,0x2
737	LONG $0x00048d45                           // lea    r8d,[r8+rax*1]
738	WORD $0x2141; BYTE $0xff                   // and    r15d,edi
739	WORD $0x3145; BYTE $0xe6                   // xor    r14d,r12d
740	WORD $0x3141; BYTE $0xcf                   // xor    r15d,ecx
741	WORD $0x3145; BYTE $0xee                   // xor    r14d,r13d
742	LONG $0x38048d42                           // lea    eax,[rax+r15*1]
743	WORD $0x8945; BYTE $0xcc                   // mov    r12d,r9d
744
745	LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
746	ADDQ $0x80, BP
747
748	CMPB 0x3(BP), $0x0
749	JNE  loop1
750
751	// ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40)
752	LONG $0x245c0344; BYTE $0x40   // add    r11d,[rsp+0x40]
753	WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
754	LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
755	LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
756	LONG $0x30048d42               // lea    eax,[rax+r14*1]
757	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
758	LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
759	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
760	LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
761	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
762	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
763	WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
764	LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
765	LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
766	WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
767	LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
768	LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
769	LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
770	WORD $0x2144; BYTE $0xff       // and    edi,r15d
771	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
772	WORD $0xdf31                   // xor    edi,ebx
773	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
774	LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
775	WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
776
777	// ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44)
778	LONG $0x24540344; BYTE $0x44   // add    r10d,[rsp+0x44]
779	WORD $0x2141; BYTE $0xd4       // and    r12d,edx
780	LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
781	LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
782	LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
783	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
784	LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
785	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
786	LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
787	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
788	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
789	WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
790	LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
791	LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
792	WORD $0xc731                   // xor    edi,eax
793	LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
794	LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
795	LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
796	WORD $0x2141; BYTE $0xff       // and    r15d,edi
797	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
798	WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
799	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
800	LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
801	WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
802
803	// ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48)
804	LONG $0x244c0344; BYTE $0x48   // add    r9d,[rsp+0x48]
805	WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
806	LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
807	LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
808	LONG $0x32148d47               // lea    r10d,[r10+r14*1]
809	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
810	LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
811	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
812	LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
813	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
814	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
815	WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
816	LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
817	LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
818	WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
819	LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
820	LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
821	LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
822	WORD $0x2144; BYTE $0xff       // and    edi,r15d
823	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
824	WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
825	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
826	LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
827	WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
828
829	// ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c)
830	LONG $0x24440344; BYTE $0x4c   // add    r8d,[rsp+0x4c]
831	WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
832	LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
833	LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
834	LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
835	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
836	LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
837	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
838	LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
839	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
840	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
841	WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
842	LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
843	LONG $0x28048d47               // lea    r8d,[r8+r13*1]
844	WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
845	LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
846	LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
847	LONG $0x00048d42               // lea    eax,[rax+r8*1]
848	WORD $0x2141; BYTE $0xff       // and    r15d,edi
849	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
850	WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
851	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
852	LONG $0x38048d47               // lea    r8d,[r8+r15*1]
853	WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
854
855	// ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60)
856	LONG $0x60245403               // add    edx,[rsp+0x60]
857	WORD $0x2141; BYTE $0xc4       // and    r12d,eax
858	LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
859	LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
860	LONG $0x30048d47               // lea    r8d,[r8+r14*1]
861	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
862	LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
863	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
864	LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
865	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
866	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
867	WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
868	LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
869	LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
870	WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
871	LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
872	LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
873	LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
874	WORD $0x2144; BYTE $0xff       // and    edi,r15d
875	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
876	WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
877	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
878	WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
879	WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
880
881	// ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64)
882	LONG $0x64244c03               // add    ecx,[rsp+0x64]
883	WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
884	LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
885	LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
886	LONG $0x32148d42               // lea    edx,[rdx+r14*1]
887	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
888	LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
889	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
890	LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
891	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
892	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
893	WORD $0xd789                   // mov    edi,edx
894	LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
895	LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
896	WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
897	LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
898	LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
899	LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
900	WORD $0x2141; BYTE $0xff       // and    r15d,edi
901	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
902	WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
903	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
904	LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
905	WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
906
907	// ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68)
908	LONG $0x68245c03               // add    ebx,[rsp+0x68]
909	WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
910	LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
911	LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
912	LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
913	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
914	LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
915	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
916	LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
917	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
918	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
919	WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
920	LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
921	LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
922	WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
923	LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
924	LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
925	LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
926	WORD $0x2144; BYTE $0xff       // and    edi,r15d
927	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
928	WORD $0xd731                   // xor    edi,edx
929	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
930	WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
931	WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
932
933	// ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c)
934	LONG $0x6c244403               // add    eax,[rsp+0x6c]
935	WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
936	LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
937	LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
938	LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
939	LONG $0x20048d42               // lea    eax,[rax+r12*1]
940	LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
941	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
942	LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
943	LONG $0x20048d42               // lea    eax,[rax+r12*1]
944	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
945	WORD $0xdf89                   // mov    edi,ebx
946	LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
947	LONG $0x28048d42               // lea    eax,[rax+r13*1]
948	WORD $0xcf31                   // xor    edi,ecx
949	LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
950	LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
951	LONG $0x00048d45               // lea    r8d,[r8+rax*1]
952	WORD $0x2141; BYTE $0xff       // and    r15d,edi
953	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
954	WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
955	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
956	LONG $0x38048d42               // lea    eax,[rax+r15*1]
957	WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
958
959	// ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00)
960	LONG $0x241c0344               // add    r11d,[rsp]
961	WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
962	LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
963	LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
964	LONG $0x30048d42               // lea    eax,[rax+r14*1]
965	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
966	LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
967	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
968	LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
969	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
970	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
971	WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
972	LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
973	LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
974	WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
975	LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
976	LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
977	LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
978	WORD $0x2144; BYTE $0xff       // and    edi,r15d
979	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
980	WORD $0xdf31                   // xor    edi,ebx
981	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
982	LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
983	WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
984
985	// ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04)
986	LONG $0x24540344; BYTE $0x04   // add    r10d,[rsp+0x4]
987	WORD $0x2141; BYTE $0xd4       // and    r12d,edx
988	LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
989	LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
990	LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
991	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
992	LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
993	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
994	LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
995	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
996	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
997	WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
998	LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
999	LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
1000	WORD $0xc731                   // xor    edi,eax
1001	LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
1002	LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
1003	LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
1004	WORD $0x2141; BYTE $0xff       // and    r15d,edi
1005	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1006	WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
1007	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1008	LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
1009	WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
1010
1011	// ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08)
1012	LONG $0x244c0344; BYTE $0x08   // add    r9d,[rsp+0x8]
1013	WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
1014	LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
1015	LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
1016	LONG $0x32148d47               // lea    r10d,[r10+r14*1]
1017	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
1018	LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
1019	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
1020	LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
1021	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
1022	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1023	WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
1024	LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
1025	LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
1026	WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
1027	LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
1028	LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
1029	LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
1030	WORD $0x2144; BYTE $0xff       // and    edi,r15d
1031	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1032	WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
1033	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1034	LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
1035	WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
1036
1037	// ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c)
1038	LONG $0x24440344; BYTE $0x0c   // add    r8d,[rsp+0xc]
1039	WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
1040	LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
1041	LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
1042	LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
1043	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
1044	LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
1045	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
1046	LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
1047	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
1048	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1049	WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
1050	LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
1051	LONG $0x28048d47               // lea    r8d,[r8+r13*1]
1052	WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
1053	LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
1054	LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
1055	LONG $0x00048d42               // lea    eax,[rax+r8*1]
1056	WORD $0x2141; BYTE $0xff       // and    r15d,edi
1057	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1058	WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
1059	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1060	LONG $0x38048d47               // lea    r8d,[r8+r15*1]
1061	WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
1062
1063	// ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20)
1064	LONG $0x20245403               // add    edx,[rsp+0x20]
1065	WORD $0x2141; BYTE $0xc4       // and    r12d,eax
1066	LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
1067	LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
1068	LONG $0x30048d47               // lea    r8d,[r8+r14*1]
1069	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
1070	LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
1071	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
1072	LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
1073	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
1074	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1075	WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
1076	LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
1077	LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
1078	WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
1079	LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
1080	LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
1081	LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
1082	WORD $0x2144; BYTE $0xff       // and    edi,r15d
1083	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1084	WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
1085	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1086	WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
1087	WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
1088
1089	// ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24)
1090	LONG $0x24244c03               // add    ecx,[rsp+0x24]
1091	WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
1092	LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
1093	LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
1094	LONG $0x32148d42               // lea    edx,[rdx+r14*1]
1095	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
1096	LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
1097	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
1098	LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
1099	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
1100	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1101	WORD $0xd789                   // mov    edi,edx
1102	LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
1103	LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
1104	WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
1105	LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
1106	LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
1107	LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
1108	WORD $0x2141; BYTE $0xff       // and    r15d,edi
1109	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1110	WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
1111	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1112	LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
1113	WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
1114
1115	// ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28)
1116	LONG $0x28245c03               // add    ebx,[rsp+0x28]
1117	WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
1118	LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
1119	LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
1120	LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
1121	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
1122	LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
1123	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
1124	LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
1125	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
1126	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1127	WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
1128	LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
1129	LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
1130	WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
1131	LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
1132	LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
1133	LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
1134	WORD $0x2144; BYTE $0xff       // and    edi,r15d
1135	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1136	WORD $0xd731                   // xor    edi,edx
1137	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1138	WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
1139	WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
1140
1141	// ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c)
1142	LONG $0x2c244403               // add    eax,[rsp+0x2c]
1143	WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
1144	LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
1145	LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
1146	LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
1147	LONG $0x20048d42               // lea    eax,[rax+r12*1]
1148	LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
1149	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
1150	LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
1151	LONG $0x20048d42               // lea    eax,[rax+r12*1]
1152	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
1153	WORD $0xdf89                   // mov    edi,ebx
1154	LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
1155	LONG $0x28048d42               // lea    eax,[rax+r13*1]
1156	WORD $0xcf31                   // xor    edi,ecx
1157	LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
1158	LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
1159	LONG $0x00048d45               // lea    r8d,[r8+rax*1]
1160	WORD $0x2141; BYTE $0xff       // and    r15d,edi
1161	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
1162	WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
1163	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
1164	LONG $0x38048d42               // lea    eax,[rax+r15*1]
1165	WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
1166
	// First pass finished: fold the working variables (AX, BX, CX, DX, R8-R11)
	// into the eight 32-bit state words at $_ctx.
	MOVQ 0x200(SP), DI // $_ctx
	ADDQ R14, AX       // add the rotate term the last round left pending in R14

	LEAQ 0x1c0(SP), BP // start position for loop2, which steps BP down towards SP

	// working variable += state word ...
	ADDL (DI), AX
	ADDL 4(DI), BX
	ADDL 8(DI), CX
	ADDL 12(DI), DX
	ADDL 16(DI), R8
	ADDL 20(DI), R9
	ADDL 24(DI), R10
	ADDL 28(DI), R11

	// ... and the sums become the new state (they also stay in the registers).
	MOVL AX, (DI)
	MOVL BX, 4(DI)
	MOVL CX, 8(DI)
	MOVL DX, 12(DI)
	MOVL R8, 16(DI)
	MOVL R9, 20(DI)
	MOVL R10, 24(DI)
	MOVL R11, 28(DI)

	// Input pointer already at $_end: skip the second pass entirely.
	// NOTE(review): presumably this is the case where only one block was left
	// and the second lane holds stale data - confirm against the load code.
	CMPQ SI, 0x50(BP) // $_end
	JE   done

	// Prime the scratch registers that the first round of loop2 reads.
	XORQ R14, R14 // no pending rotate term to add to AX in the first round
	MOVQ BX, DI
	XORQ CX, DI   // magic: DI = BX ^ CX, ANDed with AX ^ BX by the first round
	MOVQ R9, R12  // R12 = R9, ANDed with R8 at the top of the first round
1197
// loop2 runs the rounds a second time over the schedule words kept on the
// stack. BP enters at SP+0x1c0 and is lowered by 0x40 per iteration until it
// drops below SP: 8 iterations of 8 rounds. Each iteration reads the words at
// BP+0x10..0x1c and BP+0x30..0x3c.
// NOTE(review): these offsets look like the upper 16-byte half of each
// 32-byte schedule row, presumably the words of the second interleaved
// block - confirm against the schedule code earlier in the function.
//
// With the first eight ROUND arguments named a..h, every round does:
//   h += schedule word
//   h += (e AND f) + (NOT e AND g)               (R12)
//   h += ror(e,25) ^ ror(e,11) ^ ror(e,6)        (R13)
//   d += h
//   h += ((a ^ b) AND (b ^ c)) ^ b               (R15 and DI swap roles each round)
//   R14 = ror(a,22) ^ ror(a,13) ^ ror(a,2), added to h by the first lea of
//   the following round (and by the ADDQ R14, AX after the loop).
loop2:
	// ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10)
	LONG $0x105d0344               // add    r11d,[rbp+0x10]
	WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
	LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
	LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
	LONG $0x30048d42               // lea    eax,[rax+r14*1]
	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
	LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
	LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
	LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
	LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
	LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
	WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
	LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
	LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
	LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
	WORD $0x2144; BYTE $0xff       // and    edi,r15d
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0xdf31                   // xor    edi,ebx
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
	WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d

	// ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14)
	LONG $0x14550344               // add    r10d,[rbp+0x14]
	WORD $0x2141; BYTE $0xd4       // and    r12d,edx
	LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
	LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
	LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
	LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
	LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
	LONG $0x22148d47               // lea    r10d,[r10+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
	LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
	LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
	WORD $0xc731                   // xor    edi,eax
	LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
	LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
	LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
	WORD $0x2141; BYTE $0xff       // and    r15d,edi
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
	WORD $0x8941; BYTE $0xd4       // mov    r12d,edx

	// ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18)
	LONG $0x184d0344               // add    r9d,[rbp+0x18]
	WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
	LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
	LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
	LONG $0x32148d47               // lea    r10d,[r10+r14*1]
	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
	LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
	LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
	LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
	LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
	LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
	WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
	LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
	LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
	LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
	WORD $0x2144; BYTE $0xff       // and    edi,r15d
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
	WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx

	// ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c)
	LONG $0x1c450344               // add    r8d,[rbp+0x1c]
	WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
	LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
	LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
	LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
	LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
	LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
	LONG $0x20048d47               // lea    r8d,[r8+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
	LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
	LONG $0x28048d47               // lea    r8d,[r8+r13*1]
	WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
	LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
	LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
	LONG $0x00048d42               // lea    eax,[rax+r8*1]
	WORD $0x2141; BYTE $0xff       // and    r15d,edi
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x38048d47               // lea    r8d,[r8+r15*1]
	WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx

	// ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30)
	WORD $0x5503; BYTE $0x30       // add    edx,[rbp+0x30]
	WORD $0x2141; BYTE $0xc4       // and    r12d,eax
	LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
	LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
	LONG $0x30048d47               // lea    r8d,[r8+r14*1]
	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
	LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
	LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
	LONG $0x22148d42               // lea    edx,[rdx+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
	LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
	LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
	WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
	LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
	LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
	LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
	WORD $0x2144; BYTE $0xff       // and    edi,r15d
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
	WORD $0x8941; BYTE $0xc4       // mov    r12d,eax

	// ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34)
	WORD $0x4d03; BYTE $0x34       // add    ecx,[rbp+0x34]
	WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
	LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
	LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
	LONG $0x32148d42               // lea    edx,[rdx+r14*1]
	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
	LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
	LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
	LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0xd789                   // mov    edi,edx
	LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
	LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
	WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
	LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
	LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
	LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
	WORD $0x2141; BYTE $0xff       // and    r15d,edi
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
	WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d

	// ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38)
	WORD $0x5d03; BYTE $0x38       // add    ebx,[rbp+0x38]
	WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
	LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
	LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
	LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
	LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
	WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
	LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
	LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
	LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
	LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
	WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
	LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
	LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
	LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
	WORD $0x2144; BYTE $0xff       // and    edi,r15d
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0xd731                   // xor    edi,edx
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
	WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d

	// ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c)
	WORD $0x4503; BYTE $0x3c       // add    eax,[rbp+0x3c]
	WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
	LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
	LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
	LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
	LONG $0x20048d42               // lea    eax,[rax+r12*1]
	LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
	WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
	LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
	LONG $0x20048d42               // lea    eax,[rax+r12*1]
	WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
	WORD $0xdf89                   // mov    edi,ebx
	LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
	LONG $0x28048d42               // lea    eax,[rax+r13*1]
	WORD $0xcf31                   // xor    edi,ecx
	LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
	LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
	LONG $0x00048d45               // lea    r8d,[r8+rax*1]
	WORD $0x2141; BYTE $0xff       // and    r15d,edi
	WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
	WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
	WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
	LONG $0x38048d42               // lea    eax,[rax+r15*1]
	WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d

	// Step down to the next 0x40-byte group; keep going while BP >= SP (unsigned).
	ADDQ $-0x40, BP
	CMPQ BP, SP
	JAE  loop2
1410
1411	MOVQ 0x200(SP), DI // $_ctx
1412	ADDQ R14, AX
1413
1414	ADDQ $0x1c0, SP
1415
1416	ADDL (DI), AX
1417	ADDL 4(DI), BX
1418	ADDL 8(DI), CX
1419	ADDL 12(DI), DX
1420	ADDL 16(DI), R8
1421	ADDL 20(DI), R9
1422
1423	ADDQ $0x80, SI    // input += 2
1424	ADDL 24(DI), R10
1425	MOVQ SI, R12
1426	ADDL 28(DI), R11
1427	CMPQ SI, 0x50(SP) // input == _end
1428
1429	MOVL AX, (DI)
1430	LONG $0xe4440f4c // cmove  r12,rsp                /* next block or stale data */
1431	MOVL AX, (DI)
1432	MOVL BX, 4(DI)
1433	MOVL CX, 8(DI)
1434	MOVL DX, 12(DI)
1435	MOVL R8, 16(DI)
1436	MOVL R9, 20(DI)
1437	MOVL R10, 24(DI)
1438	MOVL R11, 28(DI)
1439
1440	JBE  loop0
1441	LEAQ (SP), BP
1442
done:
	// On the two paths into done visible here, BP addresses the same frame
	// base: the JE taken before loop2 arrives right after LEAQ 0x1c0(SP), BP,
	// and the fall-through path has advanced SP by 0x1c0 and copied it to BP.
	MOVQ BP, SP
	MOVQ 0x58(SP), SP        // restore saved stack pointer
	WORD $0xf8c5; BYTE $0x77 // vzeroupper

	RET
1449
1450