1 /*
2 * Copyright (c) 2016, Intel Corporation.
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25 
26 #include "precompiled.hpp"
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "runtime/stubRoutines.hpp"
30 #include "macroAssembler_x86.hpp"
31 
32 // ofs and limit are used for multi-block byte array.
33 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
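// A rough sketch of that contract (comment only; compress_one_block is a
// hypothetical name): when multi_block is true, the generated code consumes one
// 64-byte block per loop iteration while ofs <= limit and returns the updated ofs:
//
//   int implCompressMultiBlock(byte b[], int ofs, int limit) {
//     do {
//       compress_one_block(b, ofs);   // one 64-byte SHA-1 block
//       ofs += 64;
//     } while (ofs <= limit);
//     return ofs;
//   }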
34 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
35   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
36   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
37 
38   Label start, done_hash, loop0;
39 
40   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
41   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
42 
43   bind(start);
44   movdqu(abcd, Address(state, 0));
45   pinsrd(e0, Address(state, 16), 3);
46   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
47   pand(e0, shuf_mask);
48   pshufd(abcd, abcd, 0x1B);
49   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
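  // At this point abcd holds the working state reversed into the dword order that
  // sha1rnds4 expects (A in the most-significant dword), and e0 holds E alone in
  // its top dword; the remaining dwords of e0 were cleared by the mask above.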
50 
51   bind(loop0);
52   // Save hash values for addition after rounds
53   movdqu(Address(rsp, 0), e0);
54   movdqu(Address(rsp, 16), abcd);
55 
56 
57   // Rounds 0 - 3
58   movdqu(msg0, Address(buf, 0));
59   pshufb(msg0, shuf_mask);
60   paddd(e0, msg0);
61   movdqa(e1, abcd);
62   sha1rnds4(abcd, e0, 0);
63 
64   // Rounds 4 - 7
65   movdqu(msg1, Address(buf, 16));
66   pshufb(msg1, shuf_mask);
67   sha1nexte(e1, msg1);
68   movdqa(e0, abcd);
69   sha1rnds4(abcd, e1, 0);
70   sha1msg1(msg0, msg1);
71 
72   // Rounds 8 - 11
73   movdqu(msg2, Address(buf, 32));
74   pshufb(msg2, shuf_mask);
75   sha1nexte(e0, msg2);
76   movdqa(e1, abcd);
77   sha1rnds4(abcd, e0, 0);
78   sha1msg1(msg1, msg2);
79   pxor(msg0, msg2);
80 
81   // Rounds 12 - 15
82   movdqu(msg3, Address(buf, 48));
83   pshufb(msg3, shuf_mask);
84   sha1nexte(e1, msg3);
85   movdqa(e0, abcd);
86   sha1msg2(msg0, msg3);
87   sha1rnds4(abcd, e1, 0);
88   sha1msg1(msg2, msg3);
89   pxor(msg1, msg3);
90 
91   // Rounds 16 - 19
92   sha1nexte(e0, msg0);
93   movdqa(e1, abcd);
94   sha1msg2(msg1, msg0);
95   sha1rnds4(abcd, e0, 0);
96   sha1msg1(msg3, msg0);
97   pxor(msg2, msg0);
98 
99   // Rounds 20 - 23
100   sha1nexte(e1, msg1);
101   movdqa(e0, abcd);
102   sha1msg2(msg2, msg1);
103   sha1rnds4(abcd, e1, 1);
104   sha1msg1(msg0, msg1);
105   pxor(msg3, msg1);
106 
107   // Rounds 24 - 27
108   sha1nexte(e0, msg2);
109   movdqa(e1, abcd);
110   sha1msg2(msg3, msg2);
111   sha1rnds4(abcd, e0, 1);
112   sha1msg1(msg1, msg2);
113   pxor(msg0, msg2);
114 
115   // Rounds 28 - 31
116   sha1nexte(e1, msg3);
117   movdqa(e0, abcd);
118   sha1msg2(msg0, msg3);
119   sha1rnds4(abcd, e1, 1);
120   sha1msg1(msg2, msg3);
121   pxor(msg1, msg3);
122 
123   // Rounds 32 - 35
124   sha1nexte(e0, msg0);
125   movdqa(e1, abcd);
126   sha1msg2(msg1, msg0);
127   sha1rnds4(abcd, e0, 1);
128   sha1msg1(msg3, msg0);
129   pxor(msg2, msg0);
130 
131   // Rounds 36 - 39
132   sha1nexte(e1, msg1);
133   movdqa(e0, abcd);
134   sha1msg2(msg2, msg1);
135   sha1rnds4(abcd, e1, 1);
136   sha1msg1(msg0, msg1);
137   pxor(msg3, msg1);
138 
139   // Rounds 40 - 43
140   sha1nexte(e0, msg2);
141   movdqa(e1, abcd);
142   sha1msg2(msg3, msg2);
143   sha1rnds4(abcd, e0, 2);
144   sha1msg1(msg1, msg2);
145   pxor(msg0, msg2);
146 
147   // Rounds 44 - 47
148   sha1nexte(e1, msg3);
149   movdqa(e0, abcd);
150   sha1msg2(msg0, msg3);
151   sha1rnds4(abcd, e1, 2);
152   sha1msg1(msg2, msg3);
153   pxor(msg1, msg3);
154 
155   // Rounds 48 - 51
156   sha1nexte(e0, msg0);
157   movdqa(e1, abcd);
158   sha1msg2(msg1, msg0);
159   sha1rnds4(abcd, e0, 2);
160   sha1msg1(msg3, msg0);
161   pxor(msg2, msg0);
162 
163   // Rounds 52 - 55
164   sha1nexte(e1, msg1);
165   movdqa(e0, abcd);
166   sha1msg2(msg2, msg1);
167   sha1rnds4(abcd, e1, 2);
168   sha1msg1(msg0, msg1);
169   pxor(msg3, msg1);
170 
171   // Rounds 56 - 59
172   sha1nexte(e0, msg2);
173   movdqa(e1, abcd);
174   sha1msg2(msg3, msg2);
175   sha1rnds4(abcd, e0, 2);
176   sha1msg1(msg1, msg2);
177   pxor(msg0, msg2);
178 
179   // Rounds 60 - 63
180   sha1nexte(e1, msg3);
181   movdqa(e0, abcd);
182   sha1msg2(msg0, msg3);
183   sha1rnds4(abcd, e1, 3);
184   sha1msg1(msg2, msg3);
185   pxor(msg1, msg3);
186 
187   // Rounds 64 - 67
188   sha1nexte(e0, msg0);
189   movdqa(e1, abcd);
190   sha1msg2(msg1, msg0);
191   sha1rnds4(abcd, e0, 3);
192   sha1msg1(msg3, msg0);
193   pxor(msg2, msg0);
194 
195   // Rounds 68 - 71
196   sha1nexte(e1, msg1);
197   movdqa(e0, abcd);
198   sha1msg2(msg2, msg1);
199   sha1rnds4(abcd, e1, 3);
200   pxor(msg3, msg1);
201 
202   // Rounds 72 - 75
203   sha1nexte(e0, msg2);
204   movdqa(e1, abcd);
205   sha1msg2(msg3, msg2);
206   sha1rnds4(abcd, e0, 3);
207 
208   // Rounds 76 - 79
209   sha1nexte(e1, msg3);
210   movdqa(e0, abcd);
211   sha1rnds4(abcd, e1, 3);
212 
213   // add the current hash values to the ones saved before the rounds
214   movdqu(msg0, Address(rsp, 0));
215   sha1nexte(e0, msg0);
216   movdqu(msg0, Address(rsp, 16));
217   paddd(abcd, msg0);
218 
219   if (multi_block) {
220     // increment data pointer and loop if more to process
221     addptr(buf, 64);
222     addptr(ofs, 64);
223     cmpptr(ofs, limit);
224     jcc(Assembler::belowEqual, loop0);
225     movptr(rax, ofs); //return ofs
226   }
227   // write hash values back in the correct order
228   pshufd(abcd, abcd, 0x1b);
229   movdqu(Address(state, 0), abcd);
230   pextrd(Address(state, 16), e0, 3);
231 
232   bind(done_hash);
233 
234 }
235 
236 // xmm0 (msg) is used as an implicit argument to sha256rnds2,
237 // so state0 and state1 must never be assigned to the xmm0 register.
238 // ofs and limit are used for multi-block byte array.
239 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
240 #ifdef _LP64
241 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
242   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
243   Register buf, Register state, Register ofs, Register limit, Register rsp,
244   bool multi_block, XMMRegister shuf_mask) {
245 #else
246 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
247   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
248   Register buf, Register state, Register ofs, Register limit, Register rsp,
249   bool multi_block) {
250 #endif
251   Label start, done_hash, loop0;
252 
253   address K256 = StubRoutines::x86::k256_addr();
254   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
255 
256   bind(start);
257   movdqu(state0, Address(state, 0));
258   movdqu(state1, Address(state, 16));
259 
260   pshufd(state0, state0, 0xB1);
261   pshufd(state1, state1, 0x1B);
262   movdqa(msgtmp4, state0);
263   palignr(state0, state1, 8);
264   pblendw(state1, msgtmp4, 0xF0);
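  // After these shuffles state0 holds the {a,b,e,f} words and state1 the {c,d,g,h}
  // words, which is the packed layout sha256rnds2 operates on; the inverse shuffle
  // near the end of this function restores the natural order before writing back.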
265 
266 #ifdef _LP64
267   movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
268 #endif
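  // rax is used below as the base of the K256 round-constant table: each group of
  // four rounds adds the next 16 bytes of constants (offsets 0 through 240) to the
  // corresponding message dwords before sha256rnds2 consumes them via xmm0.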
269   lea(rax, ExternalAddress(K256));
270 
271   bind(loop0);
272   movdqu(Address(rsp, 0), state0);
273   movdqu(Address(rsp, 16), state1);
274 
275   // Rounds 0-3
276   movdqu(msg, Address(buf, 0));
277 #ifdef _LP64
278   pshufb(msg, shuf_mask);
279 #else
280   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
281 #endif
282   movdqa(msgtmp0, msg);
283   paddd(msg, Address(rax, 0));
284   sha256rnds2(state1, state0);
285   pshufd(msg, msg, 0x0E);
286   sha256rnds2(state0, state1);
287 
288   // Rounds 4-7
289   movdqu(msg, Address(buf, 16));
290 #ifdef _LP64
291   pshufb(msg, shuf_mask);
292 #else
293   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
294 #endif
295   movdqa(msgtmp1, msg);
296   paddd(msg, Address(rax, 16));
297   sha256rnds2(state1, state0);
298   pshufd(msg, msg, 0x0E);
299   sha256rnds2(state0, state1);
300   sha256msg1(msgtmp0, msgtmp1);
301 
302   // Rounds 8-11
303   movdqu(msg, Address(buf, 32));
304 #ifdef _LP64
305   pshufb(msg, shuf_mask);
306 #else
307   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
308 #endif
309   movdqa(msgtmp2, msg);
310   paddd(msg, Address(rax, 32));
311   sha256rnds2(state1, state0);
312   pshufd(msg, msg, 0x0E);
313   sha256rnds2(state0, state1);
314   sha256msg1(msgtmp1, msgtmp2);
315 
316   // Rounds 12-15
317   movdqu(msg, Address(buf, 48));
318 #ifdef _LP64
319   pshufb(msg, shuf_mask);
320 #else
321   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
322 #endif
323   movdqa(msgtmp3, msg);
324   paddd(msg, Address(rax, 48));
325   sha256rnds2(state1, state0);
326   movdqa(msgtmp4, msgtmp3);
327   palignr(msgtmp4, msgtmp2, 4);
328   paddd(msgtmp0, msgtmp4);
329   sha256msg2(msgtmp0, msgtmp3);
330   pshufd(msg, msg, 0x0E);
331   sha256rnds2(state0, state1);
332   sha256msg1(msgtmp2, msgtmp3);
333 
334   // Rounds 16-19
335   movdqa(msg, msgtmp0);
336   paddd(msg, Address(rax, 64));
337   sha256rnds2(state1, state0);
338   movdqa(msgtmp4, msgtmp0);
339   palignr(msgtmp4, msgtmp3, 4);
340   paddd(msgtmp1, msgtmp4);
341   sha256msg2(msgtmp1, msgtmp0);
342   pshufd(msg, msg, 0x0E);
343   sha256rnds2(state0, state1);
344   sha256msg1(msgtmp3, msgtmp0);
345 
346   // Rounds 20-23
347   movdqa(msg, msgtmp1);
348   paddd(msg, Address(rax, 80));
349   sha256rnds2(state1, state0);
350   movdqa(msgtmp4, msgtmp1);
351   palignr(msgtmp4, msgtmp0, 4);
352   paddd(msgtmp2, msgtmp4);
353   sha256msg2(msgtmp2, msgtmp1);
354   pshufd(msg, msg, 0x0E);
355   sha256rnds2(state0, state1);
356   sha256msg1(msgtmp0, msgtmp1);
357 
358   // Rounds 24-27
359   movdqa(msg, msgtmp2);
360   paddd(msg, Address(rax, 96));
361   sha256rnds2(state1, state0);
362   movdqa(msgtmp4, msgtmp2);
363   palignr(msgtmp4, msgtmp1, 4);
364   paddd(msgtmp3, msgtmp4);
365   sha256msg2(msgtmp3, msgtmp2);
366   pshufd(msg, msg, 0x0E);
367   sha256rnds2(state0, state1);
368   sha256msg1(msgtmp1, msgtmp2);
369 
370   // Rounds 28-31
371   movdqa(msg, msgtmp3);
372   paddd(msg, Address(rax, 112));
373   sha256rnds2(state1, state0);
374   movdqa(msgtmp4, msgtmp3);
375   palignr(msgtmp4, msgtmp2, 4);
376   paddd(msgtmp0, msgtmp4);
377   sha256msg2(msgtmp0, msgtmp3);
378   pshufd(msg, msg, 0x0E);
379   sha256rnds2(state0, state1);
380   sha256msg1(msgtmp2, msgtmp3);
381 
382   // Rounds 32-35
383   movdqa(msg, msgtmp0);
384   paddd(msg, Address(rax, 128));
385   sha256rnds2(state1, state0);
386   movdqa(msgtmp4, msgtmp0);
387   palignr(msgtmp4, msgtmp3, 4);
388   paddd(msgtmp1, msgtmp4);
389   sha256msg2(msgtmp1, msgtmp0);
390   pshufd(msg, msg, 0x0E);
391   sha256rnds2(state0, state1);
392   sha256msg1(msgtmp3, msgtmp0);
393 
394   // Rounds 36-39
395   movdqa(msg, msgtmp1);
396   paddd(msg, Address(rax, 144));
397   sha256rnds2(state1, state0);
398   movdqa(msgtmp4, msgtmp1);
399   palignr(msgtmp4, msgtmp0, 4);
400   paddd(msgtmp2, msgtmp4);
401   sha256msg2(msgtmp2, msgtmp1);
402   pshufd(msg, msg, 0x0E);
403   sha256rnds2(state0, state1);
404   sha256msg1(msgtmp0, msgtmp1);
405 
406   // Rounds 40-43
407   movdqa(msg, msgtmp2);
408   paddd(msg, Address(rax, 160));
409   sha256rnds2(state1, state0);
410   movdqa(msgtmp4, msgtmp2);
411   palignr(msgtmp4, msgtmp1, 4);
412   paddd(msgtmp3, msgtmp4);
413   sha256msg2(msgtmp3, msgtmp2);
414   pshufd(msg, msg, 0x0E);
415   sha256rnds2(state0, state1);
416   sha256msg1(msgtmp1, msgtmp2);
417 
418   // Rounds 44-47
419   movdqa(msg, msgtmp3);
420   paddd(msg, Address(rax, 176));
421   sha256rnds2(state1, state0);
422   movdqa(msgtmp4, msgtmp3);
423   palignr(msgtmp4, msgtmp2, 4);
424   paddd(msgtmp0, msgtmp4);
425   sha256msg2(msgtmp0, msgtmp3);
426   pshufd(msg, msg, 0x0E);
427   sha256rnds2(state0, state1);
428   sha256msg1(msgtmp2, msgtmp3);
429 
430   // Rounds 48-51
431   movdqa(msg, msgtmp0);
432   paddd(msg, Address(rax, 192));
433   sha256rnds2(state1, state0);
434   movdqa(msgtmp4, msgtmp0);
435   palignr(msgtmp4, msgtmp3, 4);
436   paddd(msgtmp1, msgtmp4);
437   sha256msg2(msgtmp1, msgtmp0);
438   pshufd(msg, msg, 0x0E);
439   sha256rnds2(state0, state1);
440   sha256msg1(msgtmp3, msgtmp0);
441 
442   // Rounds 52-55
443   movdqa(msg, msgtmp1);
444   paddd(msg, Address(rax, 208));
445   sha256rnds2(state1, state0);
446   movdqa(msgtmp4, msgtmp1);
447   palignr(msgtmp4, msgtmp0, 4);
448   paddd(msgtmp2, msgtmp4);
449   sha256msg2(msgtmp2, msgtmp1);
450   pshufd(msg, msg, 0x0E);
451   sha256rnds2(state0, state1);
452 
453   // Rounds 56-59
454   movdqa(msg, msgtmp2);
455   paddd(msg, Address(rax, 224));
456   sha256rnds2(state1, state0);
457   movdqa(msgtmp4, msgtmp2);
458   palignr(msgtmp4, msgtmp1, 4);
459   paddd(msgtmp3, msgtmp4);
460   sha256msg2(msgtmp3, msgtmp2);
461   pshufd(msg, msg, 0x0E);
462   sha256rnds2(state0, state1);
463 
464   // Rounds 60-63
465   movdqa(msg, msgtmp3);
466   paddd(msg, Address(rax, 240));
467   sha256rnds2(state1, state0);
468   pshufd(msg, msg, 0x0E);
469   sha256rnds2(state0, state1);
470   movdqu(msg, Address(rsp, 0));
471   paddd(state0, msg);
472   movdqu(msg, Address(rsp, 16));
473   paddd(state1, msg);
474 
475   if (multi_block) {
476     // increment data pointer and loop if more to process
477     addptr(buf, 64);
478     addptr(ofs, 64);
479     cmpptr(ofs, limit);
480     jcc(Assembler::belowEqual, loop0);
481     movptr(rax, ofs); //return ofs
482   }
483 
484   pshufd(state0, state0, 0x1B);
485   pshufd(state1, state1, 0xB1);
486   movdqa(msgtmp4, state0);
487   pblendw(state0, state1, 0xF0);
488   palignr(state1, msgtmp4, 8);
489 
490   movdqu(Address(state, 0), state0);
491   movdqu(Address(state, 16), state1);
492 
493   bind(done_hash);
494 
495 }
496 
497 #ifdef _LP64
498 /*
499   The algorithm below is based on Intel publication:
500   "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
501   The assembly code was originally provided by Sean Gulley and in many places preserves
502   the original assembly NAMES and comments to simplify matching the generated assembly with its original.
503   The Java version was substantially redesigned to replace 1200 assembly instructions with a
504   much shorter run-time generator of the same code in memory.
505 */
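/*
  For reference, the FIPS 180-4 SHA-256 round functions that the S0/S1/CH/MAJ
  comments below refer to (32-bit words; "ror" is a right rotation, which is what
  the rorx instructions compute even though the inline comments write ">>"):

    Ch(e,f,g)  = (e & f) ^ (~e & g)
    Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
    S0(a)      = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
    S1(e)      = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
    s0(w)      = (w ror 7)  ^ (w ror 18) ^ (w >> 3)
    s1(w)      = (w ror 17) ^ (w ror 19) ^ (w >> 10)

  One round then computes T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t] and
  T2 = S0(a) + Maj(a,b,c), rotates the working variables by one position and
  sets e = d + T1 and a = T1 + T2.
*/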
506 
507 void MacroAssembler::sha256_AVX2_one_round_compute(
508     Register  reg_old_h,
509     Register  reg_a,
510     Register  reg_b,
511     Register  reg_c,
512     Register  reg_d,
513     Register  reg_e,
514     Register  reg_f,
515     Register  reg_g,
516     Register  reg_h,
517     int iter) {
518   const Register& reg_y0     = r13;
519   const Register& reg_y1     = r14;
520   const Register& reg_y2     = r15;
521   const Register& reg_y3     = rcx;
522   const Register& reg_T1     = r12;
523   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
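  // The last two additions that complete h for the previous round are deferred into
  // the start of the next round (reg_old_h is the same physical register as this
  // round's reg_a), so they can overlap with independent work; the fourth round of
  // each group finishes its own h in the iter % 4 == 3 epilogue below instead.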
524   if (iter%4 > 0) {
525     addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
526   }
527   movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
528   rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
529   rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
530   xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH
531 
532   xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
533   rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
534   andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH
535 
536   if (iter%4 > 0) {
537     addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
538   }
539 
540   xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
541   rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
542   xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
543   rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
544   movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA
545 
546   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
547   rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
548   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
549   orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA
550 
551   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
552   movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
553   andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
554   andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
555   addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --
556 
557 
558   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
559   orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
560   addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --
561 
562   addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
563 
564 
565   if (iter%4 == 3) {
566     addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
567     addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
568   }
569 }
570 
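// The eight working variables a..h live in rax, rbx, rdi, rsi, r8, r9, r10, r11 and
// their register assignment rotates right by one position every round. The two
// helpers below simply unroll four consecutive rounds: _first starts with a in rax,
// _last starts four rotations later with a in r8.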
571 void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
572     sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
573     sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
574     sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
575     sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
576 }
577 
578 void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
579     sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
580     sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
581     sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
582     sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
583 }
584 
585 void MacroAssembler::sha256_AVX2_one_round_and_sched(
586         XMMRegister  xmm_0,     /* == ymm4 for iterations 0-3, then the four registers rotate left every 4 iterations */
587         XMMRegister  xmm_1,     /* ymm5 */  /* the full cycle is 16 iterations */
588         XMMRegister  xmm_2,     /* ymm6 */
589         XMMRegister  xmm_3,     /* ymm7 */
590         Register  reg_a,        /* == rax on iteration 0, then the eight registers rotate right on each subsequent iteration */
591         Register  reg_b,        /* rbx */    /* the full cycle is 8 iterations */
592         Register  reg_c,        /* rdi */
593         Register  reg_d,        /* rsi */
594         Register  reg_e,        /* r8 */
595         Register  reg_f,        /* r9d */
596         Register  reg_g,        /* r10d */
597         Register  reg_h,        /* r11d */
598         int iter)
599 {
600   movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
601   rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
602   rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
603   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
604   orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA
605 
606   movl(r15, reg_f);           // r15 = reg_f               ; CH
607   rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
608   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
609   xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH
610 
611   rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
612   andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH
613 
614   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
615   rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
616   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
617 
618   andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
619   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0
620 
621   rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
622   xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
623 
624   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
625   movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
626   andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
627   addl(r15, r13);            // r15 = S1 + CH                          ; --
628 
629   orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
630   addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
631   addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
632 
633   addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
634   addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --
635 
636   if (iter%4 == 0) {
637     vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
638     vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]
639     vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
640     vpsrld(xmm2, xmm1, 7, AVX_256bit);
641     vpslld(xmm3, xmm1, 32-7, AVX_256bit);
642     vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
643     vpsrld(xmm2, xmm1,18, AVX_256bit);
644   } else if (iter%4 == 1 ) {
645     vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
646     vpslld(xmm1, xmm1, 32-18, AVX_256bit);
647     vpxor(xmm3, xmm3, xmm1, AVX_256bit);
648     vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
649     vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
650     vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
651     vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
652     vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
653   } else if (iter%4 == 2) {
654     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
655     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
656     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
657     vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
658     vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
659     vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
660     vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
661   } else if (iter%4 == 3) {
662     vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
663     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
664     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
665     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
666     vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
667     vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
668     vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
669   }
670 }
671 
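// addm/addmq: add register r2 into the 32-/64-bit word at [r1 + disp], leaving the
// sum both in r2 and in memory; used to fold the working variables back into the
// saved digest.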
672 void MacroAssembler::addm(int disp, Register r1, Register r2) {
673   addl(r2, Address(r1, disp));
674   movl(Address(r1, disp), r2);
675 }
676 
677 void MacroAssembler::addmq(int disp, Register r1, Register r2) {
678   addq(r2, Address(r1, disp));
679   movq(Address(r1, disp), r2);
680 }
681 
682 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
683   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
684   Register buf, Register state, Register ofs, Register limit, Register rsp,
685   bool multi_block, XMMRegister shuf_mask) {
686 
687   Label loop0, loop1, loop2, loop3,
688         last_block_enter, do_last_block, only_one_block, done_hash,
689         compute_size, compute_size_end,
690         compute_size1, compute_size_end1;
691 
692   address K256_W = StubRoutines::x86::k256_W_addr();
693   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
694   address pshuffle_byte_flip_mask_addr = 0;
695 
696 const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
697 const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
698 const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13
699 
700 const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   //XMM version of BYTE_FLIP_MASK
701 
702 const Register& NUM_BLKS = r8;   // 3rd arg
703 const Register& CTX      = rdx;  // 2nd arg
704 const Register& INP      = rcx;  // 1st arg
705 
706 const Register& c        = rdi;
707 const Register& d        = rsi;
708 const Register& e        = r8;    // clobbers NUM_BLKS
709 const Register& y3       = rcx;  // clobbers INP
710 
711 const Register& TBL      = rbp;
712 const Register& SRND     = CTX;   // SRND is same register as CTX
713 
714 const Register& a        = rax;
715 const Register& b        = rbx;
716 const Register& f        = r9;
717 const Register& g        = r10;
718 const Register& h        = r11;
719 
720 const Register& T1       = r12;
721 const Register& y0       = r13;
722 const Register& y1       = r14;
723 const Register& y2       = r15;
724 
725 
726 enum {
727   _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
728   _INP_END_SIZE = 8,
729   _INP_SIZE = 8,
730   _CTX_SIZE = 8,
731   _RSP_SIZE = 8,
732 
733   _XFER = 0,
734   _INP_END   = _XFER     + _XFER_SIZE,
735   _INP       = _INP_END  + _INP_END_SIZE,
736   _CTX       = _INP      + _INP_SIZE,
737   _RSP       = _CTX      + _CTX_SIZE,
738   STACK_SIZE = _RSP      + _RSP_SIZE
739 };
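// With the sizes above, the frame laid out on the aligned stack is 512 bytes of XFER
// scratch for the two interleaved blocks, followed by _INP_END at offset 512, _INP at
// 520, _CTX at 528 and the saved rsp at 536, giving a STACK_SIZE of 544 bytes.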
740 
741 #ifndef _WIN64
742   push(rcx);    // linux: this is limit, need at the end
743   push(rdx);    // linux: this is ofs
744 #else
745   push(r8);     // win64: this is ofs
746   push(r9);     // win64: this is limit, we need them again at the very end
747 #endif
748 
749 
750   push(rbx);
751 #ifdef _WIN64
752   push(rsi);
753   push(rdi);
754 #endif
755   push(rbp);
756   push(r12);
757   push(r13);
758   push(r14);
759   push(r15);
760 
761   movq(rax, rsp);
762   subq(rsp, STACK_SIZE);
763   andq(rsp, -32);
764   movq(Address(rsp, _RSP), rax);
765 
766 #ifndef _WIN64
767   // copy linux params to win64 params, so the rest of the code is the same for both
768   movq(r9,  rcx);
769   movq(r8,  rdx);
770   movq(rdx, rsi);
771   movq(rcx, rdi);
772 #endif
773 
774   // setting original assembly ABI
775   /** message to hash is in INP */
776   lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
777   /** digest in CTX             */
778   movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
779 
780   /** NUM_BLKS is the length of the message in bytes; it has to be derived from ofs and limit */
781   if (multi_block) {
782 
783     // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
784     // on entry r8 = ofs
785     // on exit  r8 = NUM_BLKS
786 
787     xorq(rax, rax);
788 
789     bind(compute_size);
790     cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
791     jccb(Assembler::aboveEqual, compute_size_end);
792     addq(r8, 64);                                          //;; linux: ofs = rdx
793     addq(rax, 64);
794     jmpb(compute_size);
795 
796     bind(compute_size_end);
797     movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
798 
799     cmpq(NUM_BLKS, 0);
800     jcc(Assembler::equal, done_hash);
801 
802   } else {
803     xorq(NUM_BLKS, NUM_BLKS);
804     addq(NUM_BLKS, 64);
805   }//if (!multi_block)
806 
807   lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
808   movq(Address(rsp, _INP_END), NUM_BLKS);
809 
810   cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
811   jcc(Assembler::equal, only_one_block);   //je only_one_block
812 
813   // load initial digest
814   movl(a, Address(CTX, 4*0));
815   movl(b, Address(CTX, 4*1));
816   movl(c, Address(CTX, 4*2));
817   movl(d, Address(CTX, 4*3));
818   movl(e, Address(CTX, 4*4));
819   movl(f, Address(CTX, 4*5));
820   // g (r10) is loaded later, after r10 has been used as a scratch register
821   movl(h, Address(CTX, 4*7));
822 
823   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
824   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
825   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
826   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
827 
828   movl(g, Address(CTX, 4*6));
829 
830   movq(Address(rsp, _CTX), CTX);           // store
831 
832 bind(loop0);
833   lea(TBL, ExternalAddress(K256_W));
834 
835   // assume buffers not aligned
836 
837   // Load the first 16 dwords of each of the two blocks
838   vmovdqu(xmm0, Address(INP, 0*32));
839   vmovdqu(xmm1, Address(INP, 1*32));
840   vmovdqu(xmm2, Address(INP, 2*32));
841   vmovdqu(xmm3, Address(INP, 3*32));
842 
843   // byte swap data
844   vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
845   vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
846   vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
847   vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
848 
849   // transpose data into high/low halves
850   vperm2i128(xmm4, xmm0, xmm2, 0x20);
851   vperm2i128(xmm5, xmm0, xmm2, 0x31);
852   vperm2i128(xmm6, xmm1, xmm3, 0x20);
853   vperm2i128(xmm7, xmm1, xmm3, 0x31);
854 
855 bind(last_block_enter);
856   addq(INP, 64);
857   movq(Address(rsp, _INP), INP);
858 
859   //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
860   xorq(SRND, SRND);
861 
862 align(16);
863 bind(loop1);
864   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
865   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
866   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
867   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
868   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
869   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
870 
871   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
872   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
873   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
874   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
875   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
876   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
877 
878   vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
879   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
880   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
881   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
882   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
883   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
884 
885   vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
886   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
887 
888   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
889   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
890   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
891   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
892 
893   addq(SRND, 4*32);
894   cmpq(SRND, 3 * 4*32);
895   jcc(Assembler::below, loop1);
896 
897 bind(loop2);
898   // Do last 16 rounds with no scheduling
899   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
900   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
901   sha256_AVX2_four_rounds_compute_first(0);
902 
903   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
904   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
905   sha256_AVX2_four_rounds_compute_last(0 + 8);
906 
907   addq(SRND, 2*32);
908 
909   vmovdqu(xmm4, xmm6);
910   vmovdqu(xmm5, xmm7);
911 
912   cmpq(SRND, 4 * 4*32);
913   jcc(Assembler::below, loop2);
914 
915   movq(CTX, Address(rsp, _CTX));
916   movq(INP, Address(rsp, _INP));
917 
918   addm(4*0, CTX, a);
919   addm(4*1, CTX, b);
920   addm(4*2, CTX, c);
921   addm(4*3, CTX, d);
922   addm(4*4, CTX, e);
923   addm(4*5, CTX, f);
924   addm(4*6, CTX, g);
925   addm(4*7, CTX, h);
926 
927   cmpq(INP, Address(rsp, _INP_END));
928   jcc(Assembler::above, done_hash);
929 
930   //Do second block using previously scheduled results
931   xorq(SRND, SRND);
932 align(16);
933 bind(loop3);
934   sha256_AVX2_four_rounds_compute_first(4);
935   sha256_AVX2_four_rounds_compute_last(4+8);
936 
937   addq(SRND, 2*32);
938   cmpq(SRND, 4 * 4*32);
939   jcc(Assembler::below, loop3);
940 
941   movq(CTX, Address(rsp, _CTX));
942   movq(INP, Address(rsp, _INP));
943   addq(INP, 64);
944 
945   addm(4*0, CTX, a);
946   addm(4*1, CTX, b);
947   addm(4*2, CTX, c);
948   addm(4*3, CTX, d);
949   addm(4*4, CTX, e);
950   addm(4*5, CTX, f);
951   addm(4*6, CTX, g);
952   addm(4*7, CTX, h);
953 
954   cmpq(INP, Address(rsp, _INP_END));
955   jcc(Assembler::below, loop0);
956   jccb(Assembler::above, done_hash);
957 
958 bind(do_last_block);
959   lea(TBL, ExternalAddress(K256_W));
960 
961   movdqu(xmm4, Address(INP, 0*16));
962   movdqu(xmm5, Address(INP, 1*16));
963   movdqu(xmm6, Address(INP, 2*16));
964   movdqu(xmm7, Address(INP, 3*16));
965 
966   vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
967   vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
968   vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
969   vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
970 
971   jmp(last_block_enter);
972 
973 bind(only_one_block);
974 
975   // load initial digest ;; table should be preloaded with following values
976   movl(a, Address(CTX, 4*0));   // 0x6a09e667
977   movl(b, Address(CTX, 4*1));   // 0xbb67ae85
978   movl(c, Address(CTX, 4*2));   // 0x3c6ef372
979   movl(d, Address(CTX, 4*3));   // 0xa54ff53a
980   movl(e, Address(CTX, 4*4));   // 0x510e527f
981   movl(f, Address(CTX, 4*5));   // 0x9b05688c
982   // g (r10) is loaded later, after r10 has been used as a scratch register
983   movl(h, Address(CTX, 4*7));   // 0x5be0cd19
984 
985 
986   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
987   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
988   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
989   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
990 
991   movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
992 
993   movq(Address(rsp, _CTX), CTX);
994   jmpb(do_last_block);
995 
996 bind(done_hash);
997 
998   movq(rsp, Address(rsp, _RSP));
999 
1000   pop(r15);
1001   pop(r14);
1002   pop(r13);
1003   pop(r12);
1004   pop(rbp);
1005 #ifdef _WIN64
1006   pop(rdi);
1007   pop(rsi);
1008 #endif
1009   pop(rbx);
1010 
1011 #ifdef _WIN64
1012   pop(r9);
1013   pop(r8);
1014 #else
1015   pop(rdx);
1016   pop(rcx);
1017 #endif
1018 
1019   if (multi_block) {
1020 #ifdef _WIN64
1021 const Register& limit_end = r9;
1022 const Register& ofs_end   = r8;
1023 #else
1024 const Register& limit_end = rcx;
1025 const Register& ofs_end   = rdx;
1026 #endif
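    // The ofs and limit arguments were clobbered during hashing; they were saved at
    // entry and popped just above, so recompute the updated ofs (the return value)
    // in rax by stepping it in 64-byte blocks until it reaches limit, mirroring the
    // block-count loop at the top of this stub.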
1027     movq(rax, ofs_end);
1028 
1029 bind(compute_size1);
1030     cmpptr(rax, limit_end); // assume the original ofs <= limit
1031     jccb(Assembler::aboveEqual, compute_size_end1);
1032     addq(rax, 64);
1033     jmpb(compute_size1);
1034 
1035 bind(compute_size_end1);
1036   }
1037 }
1038 
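/*
  For reference, the FIPS 180-4 SHA-512 round functions used in the comments of the
  two helpers below (64-bit words; "ror" is a right rotation):

    Ch(e,f,g)  = (e & f) ^ (~e & g)
    Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
    S0(a)      = (a ror 28) ^ (a ror 34) ^ (a ror 39)
    S1(e)      = (e ror 14) ^ (e ror 18) ^ (e ror 41)
    s0(w)      = (w ror 1)  ^ (w ror 8)  ^ (w >> 7)
    s1(w)      = (w ror 19) ^ (w ror 61) ^ (w >> 6)

  Each round computes T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t] and
  T2 = S0(a) + Maj(a,b,c), then rotates the working variables with e = d + T1
  and a = T1 + T2.
*/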
1039 void MacroAssembler::sha512_AVX2_one_round_compute(Register  old_h, Register a, Register b, Register c,
1040                                                    Register d, Register e, Register f, Register g, Register h,
1041                                                    int iteration)
1042 {
1043 
1044     const Register& y0 = r13;
1045     const Register& y1 = r14;
1046     const Register& y2 = r15;
1047 #ifdef _WIN64
1048     const Register& y3 = rcx;
1049 #else
1050     const Register& y3 = rdi;
1051 #endif
1052     const Register& T1 = r12;
1053 
1054     if (iteration % 4 > 0) {
1055       addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
1056     }
1057     movq(y2, f); //y2 = f; CH
1058     rorxq(y0, e, 41); //y0 = e >> 41; S1A
1059     rorxq(y1, e, 18); //y1 = e >> 18; S1B
1060     xorq(y2, g); //y2 = f^g; CH
1061 
1062     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
1063     rorxq(y1, e, 14); //y1 = (e >> 14); S1
1064     andq(y2, e); //y2 = (f^g)&e; CH
1065 
1066     if (iteration % 4 > 0 ) {
1067       addq(old_h, y3); //h = t1 + S0 + MAJ
1068     }
1069     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
1070     rorxq(T1, a, 34); //T1 = a >> 34; S0B
1071     xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
1072     rorxq(y1, a, 39); //y1 = a >> 39; S0A
1073     movq(y3, a); //y3 = a; MAJA
1074 
1075     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
1076     rorxq(T1, a, 28); //T1 = (a >> 28); S0
1077     addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
1078     orq(y3, c); //y3 = a | c; MAJA
1079 
1080     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
1081     movq(T1, a); //T1 = a; MAJB
1082     andq(y3, b); //y3 = (a | c)&b; MAJA
1083     andq(T1, c); //T1 = a&c; MAJB
1084     addq(y2, y0); //y2 = S1 + CH; --
1085 
1086     addq(d, h); //d = k + w + h + d; --
1087     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
1088     addq(h, y1); //h = k + w + h + S0; --
1089 
1090     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
1091 
1092     if (iteration % 4 == 3) {
1093       addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
1094       addq(h, y3); //h = t1 + S0 + MAJ; --
1095     }
1096 }
1097 
1098 void MacroAssembler::sha512_AVX2_one_round_and_schedule(
1099     XMMRegister xmm4, // ymm4
1100     XMMRegister xmm5, // ymm5
1101     XMMRegister xmm6, // ymm6
1102     XMMRegister xmm7, // ymm7
1103     Register a, //rax
1104     Register b, //rbx
1105     Register c, //rdi
1106     Register d, //rsi
1107     Register e, //r8
1108     Register f, //r9
1109     Register g, //r10
1110     Register h, //r11
1111     int iteration)
1112 {
1113 
1114     const Register& y0 = r13;
1115     const Register& y1 = r14;
1116     const Register& y2 = r15;
1117 #ifdef _WIN64
1118     const Register& y3 = rcx;
1119 #else
1120     const Register& y3 = rdi;
1121 #endif
1122     const Register& T1 = r12;
1123 
1124     if (iteration % 4 == 0) {
1125       // Extract w[t - 7]
1126       // xmm0 = W[-7]
1127       vperm2f128(xmm0, xmm7, xmm6, 3);
1128       vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
1129 
1130       // Calculate w[t - 16] + w[t - 7]
1131       vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
1132       // Extract w[t - 15]
1133       //xmm1 = W[-15]
1134       vperm2f128(xmm1, xmm5, xmm4, 3);
1135       vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
1136 
1137       // Calculate sigma0
1138       // Calculate w[t - 15] ror 1
1139       vpsrlq(xmm2, xmm1, 1, AVX_256bit);
1140       vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
1141       vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
1142       // Calculate w[t - 15] shr 7
1143       vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
1144 
1145     } else if (iteration % 4 == 1) {
1146       //Calculate w[t - 15] ror 8
1147       vpsrlq(xmm2, xmm1, 8, AVX_256bit);
1148       vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
1149       vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
1150 
1151       //XOR the three components
1152       vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
1153       vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
1154 
1155       //Add three components, w[t - 16], w[t - 7] and sigma0
1156       vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
1157 
1158       // Move to appropriate lanes for calculating w[16] and w[17]
1159       vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
1160 
1161       //Move to appropriate lanes for calculating w[18] and w[19]
1162       vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
1163       //Calculate w[16] and w[17] in both 128 bit lanes
1164       //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
1165       vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
1166       vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
1167 
1168     } else if (iteration % 4 == 2) {
1169       vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
1170       vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
1171       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
1172       vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
1173       vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
1174       vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
1175       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
1176       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
1177 
1178       //Add sigma1 to the other components to get w[16] and w[17]
1179       vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
1180 
1181       //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
1182       vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
1183 
1184     } else if (iteration % 4 == 3){
1185       vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
1186       vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
1187       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
1188       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
1189       vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
1190       vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
1191       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
1192       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
1193 
1194       //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
1195       vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
1196 
1197       //Form w[19], w[18], w[17], w[16]
1198       vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
1199     }
1200 
1201     movq(y3, a); //y3 = a; MAJA
1202     rorxq(y0, e, 41); // y0 = e >> 41; S1A
1203     rorxq(y1, e, 18); //y1 = e >> 18; S1B
1204     addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
1205     orq(y3, c); //y3 = a | c; MAJA
1206     movq(y2, f); //y2 = f; CH
1207 
1208     xorq(y2, g); //y2 = f^g; CH
1209 
1210     rorxq(T1, a, 34); //T1 = a >> 34; S0B
1211     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
1212 
1213     rorxq(y1, e, 14); //y1 = (e >> 14); S1
1214 
1215     andq(y2, e); //y2 = (f^g) & e; CH
1216     addq(d, h); //d = k + w + h + d; --
1217 
1218     andq(y3, b); //y3 = (a | c)&b; MAJA
1219     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
1220     rorxq(y1, a, 39); //y1 = a >> 39; S0A
1221 
1222     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
1223     rorxq(T1, a, 28); //T1 = (a >> 28); S0
1224     xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH
1225 
1226     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
1227     movq(T1, a); //T1 = a; MAJB
1228 
1229     andq(T1, c); //T1 = a&c; MAJB
1230     addq(y2, y0); //y2 = S1 + CH; --
1231 
1232     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
1233     addq(h, y1); //h = k + w + h + S0; --
1234 
1235     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
1236     addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
1237     addq(h, y3); //h = t1 + S0 + MAJ; --
1238 }
1239 
1240 void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
1241                                  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
1242                                  Register buf, Register state, Register ofs, Register limit, Register rsp,
1243                                  bool multi_block, XMMRegister shuf_mask)
1244 {
1245 
1246     Label loop0, loop1, loop2, done_hash,
1247     compute_block_size, compute_size,
1248     compute_block_size_end, compute_size_end;
1249 
1250     address K512_W = StubRoutines::x86::k512_W_addr();
1251     address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
1252     address pshuffle_byte_flip_mask_addr = 0;
1253 
1254     const XMMRegister& XFER = xmm0; // YTMP0
1255     const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
1256     const XMMRegister& YMM_MASK_LO = xmm10; // ymm10
1257 #ifdef _WIN64
1258     const Register& INP = rcx; //1st arg
1259     const Register& CTX = rdx; //2nd arg
1260     const Register& NUM_BLKS = r8; //3rd arg
1261     const Register& c = rdi;
1262     const Register& d = rsi;
1263     const Register& e = r8;
1264     const Register& y3 = rcx;
1265     const Register& offset = r8;
1266     const Register& input_limit = r9;
1267 #else
1268     const Register& INP = rdi; //1st arg
1269     const Register& CTX = rsi; //2nd arg
1270     const Register& NUM_BLKS = rdx; //3rd arg
1271     const Register& c  = rcx;
1272     const Register& d  = r8;
1273     const Register& e  = rdx;
1274     const Register& y3 = rdi;
1275     const Register& offset = rdx;
1276     const Register& input_limit = rcx;
1277 #endif
1278 
1279     const Register& TBL = rbp;
1280 
1281     const Register& a = rax;
1282     const Register& b = rbx;
1283 
1284     const Register& f = r9;
1285     const Register& g = r10;
1286     const Register& h = r11;
1287 
1288     //Local variables as defined in assembly file.
1289     enum
1290     {
1291       _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
1292       _SRND_SIZE = 8, // resq 1
1293       _INP_SIZE = 8,
1294       _INP_END_SIZE = 8,
1295       _RSP_SAVE_SIZE = 8,  // defined as resq 1
1296 
1297 #ifdef _WIN64
1298       _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
1299 #else
1300       _GPR_SAVE_SIZE = 6 * 8 // resq 6
1301 #endif
1302     };
1303 
1304     enum
1305     {
1306       _XFER = 0,
1307       _SRND = _XFER + _XFER_SIZE, // 32
1308       _INP = _SRND + _SRND_SIZE, // 40
1309       _INP_END = _INP + _INP_SIZE, // 48
1310       _RSP = _INP_END + _INP_END_SIZE, // 56
1311       _GPR = _RSP + _RSP_SAVE_SIZE, // 64
1312       _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
1313     };
1314 
1315 //Save offset and limit; they are needed for the block-size calculation in the multi-block SHA-512 case.
1316 #ifdef _WIN64
1317     push(r8);    // win64: this is ofs
1318     push(r9);    // win64: this is limit, we need them again at the very end.
1319 #else
1320     push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
1321     push(rcx);   // linux: This is the limit.
1322 #endif
1323 
1324     //Allocate Stack Space
1325     movq(rax, rsp);
1326     subq(rsp, _STACK_SIZE);
1327     andq(rsp, -32);
1328     movq(Address(rsp, _RSP), rax);
1329 
1330     //Save GPRs
1331     movq(Address(rsp, _GPR), rbp);
1332     movq(Address(rsp, (_GPR + 8)), rbx);
1333     movq(Address(rsp, (_GPR + 16)), r12);
1334     movq(Address(rsp, (_GPR + 24)), r13);
1335     movq(Address(rsp, (_GPR + 32)), r14);
1336     movq(Address(rsp, (_GPR + 40)), r15);
1337 
1338 #ifdef _WIN64
1339     movq(Address(rsp, (_GPR + 48)), rsi);
1340     movq(Address(rsp, (_GPR + 56)), rdi);
1341 #endif
1342 
1343     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
1344     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);
1345 
1346     if (multi_block) {
1347       xorq(rax, rax);
1348       bind(compute_block_size);
1349       cmpptr(offset, input_limit); // Assuming that offset is less than limit.
1350       jccb(Assembler::aboveEqual, compute_block_size_end);
1351       addq(offset, 128);
1352       addq(rax, 128);
1353       jmpb(compute_block_size);
1354 
1355       bind(compute_block_size_end);
1356       movq(NUM_BLKS, rax);
1357 
1358       cmpq(NUM_BLKS, 0);
1359       jcc(Assembler::equal, done_hash);
1360     } else {
1361       xorq(NUM_BLKS, NUM_BLKS); //If single block.
1362       addq(NUM_BLKS, 128);
1363     }
1364 
1365     addq(NUM_BLKS, INP); //pointer to end of data
1366     movq(Address(rsp, _INP_END), NUM_BLKS);
1367 
1368     //load initial digest
1369     movq(a, Address(CTX, 8 * 0));
1370     movq(b, Address(CTX, 8 * 1));
1371     movq(c, Address(CTX, 8 * 2));
1372     movq(d, Address(CTX, 8 * 3));
1373     movq(e, Address(CTX, 8 * 4));
1374     movq(f, Address(CTX, 8 * 5));
1375     // g (r10) is loaded later, after r10 has been used as a scratch register
1376     movq(h, Address(CTX, 8 * 7));
1377 
1378     pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
1379     vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
1380     vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));
1381 
1382     movq(g, Address(CTX, 8 * 6));
1383 
1384     bind(loop0);
1385     lea(TBL, ExternalAddress(K512_W));
1386 
1387     //byte swap the first 16 qwords of the block
1388     vmovdqu(xmm4, Address(INP, 32 * 0));
1389     vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
1390     vmovdqu(xmm5, Address(INP, 32 * 1));
1391     vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
1392     vmovdqu(xmm6, Address(INP, 32 * 2));
1393     vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
1394     vmovdqu(xmm7, Address(INP, 32 * 3));
1395     vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);
1396 
1397     movq(Address(rsp, _INP), INP);
1398 
1399     movslq(Address(rsp, _SRND), 4);
1400     align(16);
1401 
1402     //Schedule 64 message qwords by calling sha512_AVX2_one_round_and_schedule
1403     bind(loop1);
1404     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
1405     vmovdqu(Address(rsp, _XFER), xmm0);
1406     //four rounds and schedule
1407     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
1408     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
1409     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
1410     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);
1411 
1412     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
1413     vmovdqu(Address(rsp, _XFER), xmm0);
1414     //four rounds and schedule
1415     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
1416     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
1417     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
1418     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);
1419 
1420     vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
1421     vmovdqu(Address(rsp, _XFER), xmm0);
1422     //four rounds and schedule
1423     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
1424     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
1425     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
1426     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);
1427 
1428     vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
1429     vmovdqu(Address(rsp, _XFER), xmm0);
1430     addq(TBL, 4 * 32);
1431     //four rounds and schedule
1432     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
1433     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
1434     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
1435     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);
1436 
1437     subq(Address(rsp, _SRND), 1);
1438     jcc(Assembler::notEqual, loop1);
1439 
1440     movslq(Address(rsp, _SRND), 2);
1441 
1442     bind(loop2);
1443     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
1444     vmovdqu(Address(rsp, _XFER), xmm0);
1445     //four rounds and compute.
1446     sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
1447     sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
1448     sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
1449     sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);
1450 
1451     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
1452     vmovdqu(Address(rsp, _XFER), xmm0);
1453     addq(TBL, 2 * 32);
1454     // four rounds and compute.
1455     sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
1456     sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
1457     sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
1458     sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);
1459 
1460     vmovdqu(xmm4, xmm6);
1461     vmovdqu(xmm5, xmm7);
1462 
1463     subq(Address(rsp, _SRND), 1);
1464     jcc(Assembler::notEqual, loop2);
1465 
1466     addmq(8 * 0, CTX, a);
1467     addmq(8 * 1, CTX, b);
1468     addmq(8 * 2, CTX, c);
1469     addmq(8 * 3, CTX, d);
1470     addmq(8 * 4, CTX, e);
1471     addmq(8 * 5, CTX, f);
1472     addmq(8 * 6, CTX, g);
1473     addmq(8 * 7, CTX, h);
1474 
1475     movq(INP, Address(rsp, _INP));
1476     addq(INP, 128);
1477     cmpq(INP, Address(rsp, _INP_END));
1478     jcc(Assembler::notEqual, loop0);
1479 
1480     bind(done_hash);
1481 
1482     //Restore GPRs
1483     movq(rbp, Address(rsp, (_GPR + 0)));
1484     movq(rbx, Address(rsp, (_GPR + 8)));
1485     movq(r12, Address(rsp, (_GPR + 16)));
1486     movq(r13, Address(rsp, (_GPR + 24)));
1487     movq(r14, Address(rsp, (_GPR + 32)));
1488     movq(r15, Address(rsp, (_GPR + 40)));
1489 
1490 #ifdef _WIN64
1491     movq(rsi, Address(rsp, (_GPR + 48)));
1492     movq(rdi, Address(rsp, (_GPR + 56)));
1493 #endif
1494 
1495     //Restore Stack Pointer
1496     movq(rsp, Address(rsp, _RSP));
1497 
1498 #ifdef _WIN64
1499     pop(r9);
1500     pop(r8);
1501 #else
1502     pop(rcx);
1503     pop(rdx);
1504 #endif
1505 
1506     if (multi_block) {
1507 #ifdef _WIN64
1508       const Register& limit_end = r9;
1509       const Register& ofs_end = r8;
1510 #else
1511       const Register& limit_end = rcx;
1512       const Register& ofs_end   = rdx;
1513 #endif
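      // Recompute the updated ofs to return in rax: the original ofs and limit were
      // saved at entry and popped just above, so step ofs in 128-byte blocks until it
      // reaches limit, matching the block-count loop at the top of this stub.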
1514       movq(rax, ofs_end);
1515       bind(compute_size);
1516       cmpptr(rax, limit_end);
1517       jccb(Assembler::aboveEqual, compute_size_end);
1518       addq(rax, 128);
1519       jmpb(compute_size);
1520       bind(compute_size_end);
1521     }
1522 }
1523 
1524 #endif //#ifdef _LP64
1525 
1526