/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
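
// For orientation (an editorial sketch, not authoritative): the MXCSR
// register keeps the sticky exception flags in bits 0-5 and the exception
// masks, rounding control, DAZ and FZ bits in bits 6-15, so masking with
// 0xFFC0 keeps the control state and ignores any pending-exception flags:
//
//   uint32_t significant_bits = mxcsr & MXCSR_MASK; // drop IE..PE flags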

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp):   parameter size (in words)              int
  //    24(rbp):   thread                                 Thread*
  //
  //      [ return_from_Java     ] <--- rsp
  //      [ argument word n      ]
  //       ...
  //  -12 [ argument word 1      ]
  //  -11 [ saved r15            ] <--- rsp_after_call
  //  -10 [ saved r14            ]
  //   -9 [ saved r13            ]
  //   -8 [ saved r12            ]
  //   -7 [ saved rbx            ]
  //   -6 [ call wrapper         ]
  //   -5 [ result               ]
  //   -4 [ result type          ]
  //   -3 [ method               ]
  //   -2 [ entry point          ]
  //   -1 [ parameters           ]
  //    0 [ saved rbp            ] <--- rbp
  //    1 [ return address       ]
  //    2 [ parameter size       ]
  //    3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp):   (interpreter) entry point              address
  //    56(rbp):   parameters                             intptr_t*
  //    64(rbp):   parameter size (in words)              int
  //    72(rbp):   thread                                 Thread*
  //
  //      [ return_from_Java     ] <--- rsp
  //      [ argument word n      ]
  //       ...
  //  -60 [ argument word 1      ]
  //  -59 [ saved xmm31          ] <--- rsp_after_call
  //      [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  //  -27 [ saved xmm15          ]
  //      [ saved xmm7-xmm14     ]
  //   -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //   -7 [ saved r15            ]
  //   -6 [ saved r14            ]
  //   -5 [ saved r13            ]
  //   -4 [ saved r12            ]
  //   -3 [ saved rdi            ]
  //   -2 [ saved rsi            ]
  //   -1 [ saved rbx            ]
  //    0 [ saved rbp            ] <--- rbp
  //    1 [ return address       ]
  //    2 [ call wrapper         ]
  //    3 [ result               ]
  //    4 [ result type          ]
  //    5 [ method               ]
  //    6 [ entry point          ]
  //    7 [ parameters           ]
  //    8 [ parameter size       ]
  //    9 [ thread               ]
  //
  // Windows reserves the caller's stack space for arguments 1-4.
  // We spill c_rarg0-c_rarg3 to this space.

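  // For reference, C++ callers reach this stub through the CallStub
  // function pointer type declared in stubRoutines.hpp, roughly:
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so on Linux the first six arguments arrive in registers, while
  // Windows passes only the first four in registers (see above).
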
  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            = 0,
    retaddr_off        = 1,
    call_wrapper_off   = 2,
    result_off         = 3,
    result_type_off    = 4,
    method_off         = 5,
    entry_point_off    = 6,
    parameters_off     = 7,
    parameter_size_off = 8,
    thread_off         = 9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            = 0,
    retaddr_off        = 1,
    parameter_size_off = 2,
    thread_off         = 3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif
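
  // For example (a sketch of the address math above): with
  // xmm_save_base == -9 and two slots per register, xmm_save(6)
  // yields Address(rbp, -9 * wordSize), xmm_save(7) yields
  // Address(rbp, -11 * wordSize) and xmm_save(15) yields
  // Address(rbp, -27 * wordSize), matching the stack diagram.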

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);       // parameter pointer
    __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0));  // get parameter
    __ addptr(c_rarg2, wordSize);         // advance to next parameter
    __ decrementl(c_rarg1);               // decrement counter
    __ push(rax);                         // pass parameter
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);               // get Method*
    __ movptr(c_rarg1, entry_point);      // get entry_point
    __ mov(r13, rsp);                     // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Implementation of jint atomic_xchg(jint exchange_value, volatile jint* dest)
  // used by Atomic::xchg(volatile jint* dest, jint exchange_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0);              // Copy to eax; we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Implementation of jlong atomic_xchg(jlong exchange_value, volatile jlong* dest)
  // used by Atomic::xchg(volatile jlong* dest, jlong exchange_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0);              // Copy to eax; we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
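  //
  // Note (a sketch of why compare_value goes through rax): the x86
  // cmpxchg instruction implicitly compares rax/eax against the memory
  // operand and, on mismatch, loads the current memory value into
  // rax/eax -- which is exactly the documented return value above.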
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
  //                                           int8_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_byte() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
    address start = __ pc();

    __ movsbq(rax, c_rarg2);
    __ lock();
    __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
  //                                            volatile int64_t* dest,
  //                                            int64_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Implementation of jint atomic_add(jint add_value, volatile jint* dest)
  // used by Atomic::add(volatile jint* dest, jint add_value)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
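  //
  // Note (a sketch of the xadd idiom used below): lock xadd atomically
  // stores old + add_value and leaves the old value in c_rarg0, so
  // adding the saved copy of add_value in rax reconstructs the new
  // value that the contract asks us to return.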
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);
    __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
    __ addl(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Implementation of intptr_t atomic_add(intptr_t add_value, volatile intptr_t* dest)
  // used by Atomic::add(volatile intptr_t* dest, intptr_t add_value)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
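  //
  // Note (an assumption about the lowering; see MacroAssembler::membar):
  // under x86-TSO only StoreLoad reordering is observable, so a full
  // fence reduces to a StoreLoad barrier, which is typically emitted as
  // a locked read-modify-write on the stack rather than mfence.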
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess).  This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize); // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK); // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

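  // The x2y_fixup stubs below patch up SSE float->integer conversions.
  // cvttss2si/cvttsd2si produce the "integer indefinite" value
  // (min_jint/min_jlong) for NaN and out-of-range inputs, while Java
  // requires NaN -> 0 and saturation at the type bounds, roughly
  // (a sketch for the f2i case):
  //
  //   jint f2i_slow(jfloat x) {
  //     if (x != x)                 return 0;        // NaN
  //     if (x >= (jfloat) max_jint) return max_jint;
  //     if (x <= (jfloat) min_jint) return min_jint;
  //     return (jint) x;
  //   }
  //
  // Each stub inspects the raw bits of the original operand, which the
  // calling code leaves on the stack, and overwrites the in/out slot
  // with the corrected result.
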
  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0);   // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0);     // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
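  //
  // Note (an editorial sketch of the exit protocol): the ret(4 * wordSize)
  // below pops exactly the four starred words, so a caller pushes r10,
  // rax, the oop and the message, calls the stub, and gets all four
  // cleaned off the stack on return.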
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
      // After previous pushes.
      oop_to_verify = 6 * wordSize,
      saved_rax     = 7 * wordSize,
      saved_r10     = 8 * wordSize,

      // Before the call to MacroAssembler::debug(), see below.
      return_addr   = 16 * wordSize,
      error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(rax, rax); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored

    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ hlt();
    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count - 1]
  //
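  //  The branch to the no-overlap target is taken when a forward copy
  //  is safe, i.e. roughly (with elem_size == 1 << sf):
  //
  //     if (to <= from || to >= from + count * elem_size)
  //       goto no_overlap;
  //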
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, since
  // those registers are non-volatile there.  r9 and r10 should not be
  // used by the caller.
  //
  DEBUG_ONLY(bool regs_in_thread;)

  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = false;)
  }

  void restore_arg_regs() {
    assert(!regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
  void setup_arg_regs_using_thread() {
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
    __ get_thread(r15_thread);
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);

    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = true;)
  }

  void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ get_thread(r15_thread);
    __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
    __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
    __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored
#endif
  }

  // Copy big chunks forward
  //
  // Inputs:
  //   end_from       - source array end address
  //   end_to         - destination array end address
  //   qword_count    - 64-bit element count, negative
  //   to             - scratch
  //   L_copy_bytes   - entry label
  //   L_copy_8_bytes - exit label
  //
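  // The negative qword_count is an indexing trick: the loops below
  // address end_from[qword_count * 8 + disp] and step qword_count up
  // toward zero, so the copy runs forward and the exit test is a simple
  // sign check.  Ignoring the unrolling, roughly:
  //
  //   while (qword_count <= 0) { end_to[qword_count] = end_from[qword_count]; qword_count++; }
  //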
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register to,
                          Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
        __ jccb(Assembler::less, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ bind(L_loop_avx512);
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        __ bind(L_below_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        } else {
          __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
          __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
          __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
          __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }

  // Copy big chunks backward
  //
  // Inputs:
  //   from           - source array address
  //   dest           - destination array address
  //   qword_count    - 64-bit element count
  //   to             - scratch
  //   L_copy_bytes   - entry label
  //   L_copy_8_bytes - exit label
  //
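  // Here qword_count stays positive and counts down, so the copy runs
  // from the high addresses toward the low ones, which is what makes it
  // safe for overlapping regions with dest > from.  Ignoring the
  // unrolling, roughly:
  //
  //   while (qword_count > 0) { --qword_count; dest[qword_count] = from[qword_count]; }
  //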
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register to,
                           Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (AVX3Threshold / 8));
        __ jccb(Assembler::greater, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ BIND(L_loop_avx512);
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        __ bind(L_below_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
          __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        } else {
          __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
          __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
          __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
          __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
          __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
          __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
          __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
          __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop);

        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
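  // The overall shape (an editorial sketch; the code below interleaves
  // these steps with the main-loop entry):
  //
  //   qwords = count >> 3;          // bulk loop copies 8 bytes at a time
  //   copy qwords * 8 bytes;        // copy_bytes_forward + L_copy_8_bytes
  //   if (count & 4) copy a dword;  // L_copy_4_bytes
  //   if (count & 2) copy a word;   // L_copy_2_bytes
  //   if (count & 1) copy a byte;   // L_copy_byte
  //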
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count); // make the count negative
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);

      __ addptr(end_from, 4);
      __ addptr(end_to, 4);

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(end_from, 8));
      __ movw(Address(end_to, 8), rax);

      __ addptr(end_from, 2);
      __ addptr(end_to, 2);

      // Check for and copy trailing byte
      __ BIND(L_copy_byte);
      __ testl(byte_count, 1);
      __ jccb(Assembler::zero, L_exit);
      __ movb(rax, Address(end_from, 8));
      __ movb(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_1);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3);   // count => qword_count

      // Copy from high to low addresses.

      // Check for and copy trailing byte
      __ testl(byte_count, 1);
      __ jcc(Assembler::zero, L_copy_2_bytes);
      __ movb(rax, Address(from, byte_count, Address::times_1, -1));
      __ movb(Address(to, byte_count, Address::times_1, -1), rax);
      __ decrement(byte_count); // Adjust for possible trailing word

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jcc(Assembler::zero, L_copy_4_bytes);
      __ movw(rax, Address(from, byte_count, Address::times_1, -2));
      __ movw(Address(to, byte_count, Address::times_1, -2), rax);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, qword_count, Address::times_8));
      __ movl(Address(to, qword_count, Address::times_8), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
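
  // The conjoint stub above must tolerate to > from overlap, so it copies
  // from high to low addresses: the trailing byte, word and dword first,
  // then qwords backward. Roughly, as an illustrative C sketch (not the
  // emitted code; memmove-style descending semantics assumed):
  //
  //   if (count & 1) copy_byte (from + count - 1, to + count - 1);
  //   if (count & 2) copy_word (...);   // next 2 bytes below the byte
  //   if (count & 4) copy_dword(...);   // next 4 bytes below that
  //   for (size_t q = count >> 3; q != 0; q--)
  //     copy_qword(from + (q - 1) * 8, to + (q - 1) * 8);
  //
  // array_overlap_test() branches to the disjoint stub's no-overlap entry
  // first, so this descending path only runs when it is really needed.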

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(word_count, count);
      __ shrptr(count, 2); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Original 'dest' is trashed, so we can't use it as a
      // base register for a possible trailing word copy

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(word_count, 2);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);

      __ addptr(end_from, 4);
      __ addptr(end_to, 4);

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(word_count, 1);
      __ jccb(Assembler::zero, L_exit);
      __ movw(rax, Address(end_from, 8));
      __ movw(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }

    return start;
  }
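
  // The forward stubs share a negative-index idiom for the bulk loop:
  // point end_from/end_to at the last qword, negate qword_count, and
  // count up toward zero so the loop needs only one increment-and-branch.
  // In C terms (a simplified sketch under those assumptions):
  //
  //   int64_t i = -(int64_t)qword_count;   // qword_count = n qwords
  //   do {
  //     end_to[i + 1] = end_from[i + 1];   // Address(end_to, i, times_8, 8)
  //     i++;                               // sets ZF when i reaches 0
  //   } while (i != 0);
  //
  // It also leaves end_from/end_to 8 bytes below the first tail byte,
  // which is why the trailing copies above use Address(end_from, 8).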

  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to    = c_rarg0;  // destination array address
    const Register value = c_rarg1;  // value
    const Register count = c_rarg2;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_2);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(word_count, count);
      __ shrptr(count, 2); // count => qword_count

      // Copy from high to low addresses.  Use 'to' as scratch.

      // Check for and copy trailing word
      __ testl(word_count, 1);
      __ jccb(Assembler::zero, L_copy_4_bytes);
      __ movw(rax, Address(from, word_count, Address::times_2, -2));
      __ movw(Address(to, word_count, Address::times_2, -2), rax);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(word_count, 2);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, qword_count, Address::times_8));
      __ movl(Address(to, qword_count, Address::times_8), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                         const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_INT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(dword_count, count);
      __ shrptr(count, 1); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
      __ jccb(Assembler::zero, L_exit);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }

    return start;
  }
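
  // For is_oop copies the raw element loop above is bracketed by the
  // collector's arraycopy hooks: arraycopy_prologue() may emit a
  // pre-barrier (or divert to a runtime copy, depending on the barrier
  // set), and arraycopy_epilogue() emits the post-barrier/card marks over
  // the copied range. Schematically:
  //
  //   bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  //   ... raw copy loop ...
  //   bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  //
  // This is why the conjoint and long variants below route their oop case
  // through L_exit instead of returning directly: every oop path must
  // reach the epilogue.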

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                         address *entry, const char *name,
                                         bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_4);
    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_INT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // no registers are destroyed by this call
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    assert_clean_int(count, rax); // Make sure 'count' is clean int.
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(dword_count, count);
      __ shrptr(count, 1); // count => qword_count

      // Copy from high to low addresses.  Use 'to' as scratch.

      // Check for and copy trailing dword
      __ testl(dword_count, 1);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, dword_count, Address::times_4, -4));
      __ movl(Address(to, dword_count, Address::times_4, -4), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    }
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }

    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register qword_count = rdx;  // elements count
    const Register end_from    = from; // source array end address
    const Register end_to      = rcx;  // destination array end address
    const Register saved_count = r11;
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // Save no-overlap entry point for generate_conjoint_long_oop_copy()
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
    // 'from', 'to' and 'qword_count' are now valid

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_LONG;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs_using_thread();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ vzeroupper();
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }

    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
    restore_arg_regs_using_thread();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
                                          address nooverlap_target, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register qword_count = rdx;  // elements count
    const Register saved_count = rcx;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_8);
    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
    // 'from', 'to' and 'qword_count' are now valid

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_LONG;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs_using_thread();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ vzeroupper();
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
    restore_arg_regs_using_thread();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Helper for generating a dynamic type check.
  // Smashes no registers.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
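
  // In outline, the fast and slow paths above perform HotSpot's usual
  // subtype check (a sketch; the real logic lives in MacroAssembler):
  //
  //   if (*(sub_klass + super_check_offset) == super_klass) -> L_success;
  //   else if (super_check_offset is not the secondary-super-cache slot)
  //     -> miss;                      // primary supers gave a definite no
  //   else scan sub_klass->secondary_supers for super_klass,
  //     caching a hit               -> L_success;
  //
  // Only a definite miss falls through, which is exactly what the
  // checkcast copy loop below relies on.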

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  // not Win64
  //    c_rarg4   - oop ckval (super_klass)
  // Win64
  //    rsp+40    - oop ckval (super_klass)
  //
  //  Output:
  //    rax ==  0  -  success
  //    rax == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done;

    // Input registers (after setup_arg_regs)
    const Register from        = rdi;   // source array address
    const Register to          = rsi;   // destination array address
    const Register length      = rdx;   // elements count
    const Register ckoff       = rcx;   // super_check_offset
    const Register ckval       = r8;    // super_klass

    // Registers used as temps (r13, r14 are save-on-entry)
    const Register end_from    = from;  // source array end address
    const Register end_to      = r13;   // destination array end address
    const Register count       = rdx;   // -(count_remaining)
    const Register r14_length  = r14;   // saved copy of length
    // End pointers are inclusive, and if length is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    const Register rax_oop    = rax;    // actual oop copied
    const Register r11_klass  = r11;    // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
                       // ckoff => rcx, ckval => r8
                       // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
    // last argument (#4) is on stack on Win64
    __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // allocate spill slots for r13, r14
    enum {
      saved_r13_offset,
      saved_r14_offset,
      saved_r10_offset,
      saved_rbp_offset
    };
    __ subptr(rsp, saved_rbp_offset * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
    __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
      Label L2;
      __ get_thread(r14);
      __ cmpptr(r15_thread, r14);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
#endif // ASSERT

    // check that int operands are properly extended to size_t
    assert_clean_int(length, rax);
    assert_clean_int(ckoff, rax);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ cmpl(ckoff, Address(ckval, sco_offset));
      __ jcc(Assembler::equal, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    // Loop-invariant addresses.  They are exclusive end pointers.
    Address end_from_addr(from, length, TIMES_OOP, 0);
    Address   end_to_addr(to,   length, TIMES_OOP, 0);
    // Loop-variant addresses.  They assume post-incremented count < 0.
    Address from_element_addr(end_from, count, TIMES_OOP, 0);
    Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BasicType type = T_OBJECT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    // Copy from low to high addresses, indexed from the end of each array.
    __ lea(end_from, end_from_addr);
    __ lea(end_to, end_to_addr);
    __ movptr(r14_length, length);        // save a copy of the length
    assert(length == count, "");          // else fix next line:
    __ negptr(count);                     // negate and test the length
    __ jcc(Assembler::notZero, L_load_element);

    // Empty array:  Nothing to do.
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (count = -count; count != 0; count++)
    // Base pointers src, dst are biased by 8*(count-1), to last element.
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
    __ increment(count);               // increment the count toward zero
    __ jcc(Assembler::zero, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
    __ testptr(rax_oop, rax_oop);
    __ jcc(Assembler::zero, L_store_element);

    __ load_klass(r11_klass, rax_oop);// query the object klass
    generate_type_check(r11_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
    // Emit GC store barriers for the oops we have copied (r14 + rdx),
    // and report their number to the caller.
    assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
    Label L_post_barrier;
    __ addptr(r14_length, count);     // K = (original - remaining) oops
    __ movptr(rax, r14_length);       // save the value
    __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

    // Come here on success only.
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax); // return 0 on success

    __ BIND(L_post_barrier);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

    // Common exit point (success or failure).
    __ BIND(L_done);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
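
  // Worked example of the return encoding above: if 10 oops were requested
  // and 3 were stored before an element failed the type check, the stub
  // returns rax = ~3 = -4 (that is, -1^K with K == 3); the caller recovers
  // K as ~rax. rax == 0 means the whole copy succeeded.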

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry, address short_copy_entry,
                               address int_copy_entry, address long_copy_entry) {

    Label L_long_aligned, L_int_aligned, L_short_aligned;

    // Input registers (before setup_arg_regs)
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register size        = c_rarg2;  // byte count (size_t)

    // Register used as a temp
    const Register bits        = rax;      // test copy of low bits

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, size);

    __ testb(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);

    __ testb(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);

    __ testb(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ shrptr(size, LogBytesPerShort); // size => short_count
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_int_aligned);
    __ shrptr(size, LogBytesPerInt); // size => int_count
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_long_aligned);
    __ shrptr(size, LogBytesPerLong); // size => qword_count
    __ jump(RuntimeAddress(long_copy_entry));

    return start;
  }
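
  // The dispatch above is equivalent to the following C (an illustrative
  // sketch; byte_copy() etc. stand for the tail-called stub entries):
  //
  //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)size;
  //   if      ((bits & 7) == 0) long_copy (from, to, size >> 3);
  //   else if ((bits & 3) == 0) int_copy  (from, to, size >> 2);
  //   else if ((bits & 1) == 0) short_copy(from, to, size >> 1);
  //   else                      byte_copy (from, to, size);
  //
  // OR-ing both addresses and the byte count into 'bits' lets one test
  // per granularity cover source alignment, destination alignment and
  // size divisibility at once.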

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ movl(temp, length);
    __ addl(temp, src_pos);             // src_pos + length
    __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ movl(temp, length);
    __ addl(temp, dst_pos);             // dst_pos + length
    __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
    // Move with sign extension can be used since they are positive.
    __ movslq(src_pos, src_pos);
    __ movslq(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }
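
  // Both bounds checks above use an unsigned compare (Assembler::above):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length) FAIL;
  //
  // src_pos and length are already known non-negative, so their 32-bit
  // sum cannot wrap, but it can set the sign bit; the unsigned compare
  // still classifies such a sum as larger than any valid array length,
  // where a signed compare would be fooled.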

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  // not Win64
  //    c_rarg4    -  element count (32-bits)
  // Win64
  //    rsp+40     -  element count (32-bits)
  //
  //  Output:
  //    rax ==  0  -  success
  //    rax == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src     = c_rarg0;  // source array oop
    const Register src_pos = c_rarg1;  // source position
    const Register dst     = c_rarg2;  // destination array oop
    const Register dst_pos = c_rarg3;  // destination position
#ifndef _WIN64
    const Register length  = c_rarg4;
#else
    const Address  length(rsp, 6 * wordSize); // elements count is on stack on Win64
#endif

    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
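
    // The padding above positions the 5-byte "jmp L_failed" emitted next
    // so that the stub's real entry point lands exactly on
    // CodeEntryAlignment. For example (hypothetical numbers): with
    // modulus == 32 and __ offset() % 32 == 11, target = 27 and
    // advance = 16, so 16 nop bytes are emitted and the jmp occupies
    // bytes 27..31 of the line, leaving the next address 32-byte aligned.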
    StubCodeMark mark(this, "StubRoutines", name);

    // Short-hop target to L_failed.  Makes for denser prologue code.
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ testptr(src, src);         // src oop
    size_t j1off = __ offset();
    __ jccb(Assembler::zero, L_failed_0);

    //  if (src_pos < 0) return -1;
    __ testl(src_pos, src_pos); // src_pos (32-bits)
    __ jccb(Assembler::negative, L_failed_0);

    //  if (dst == NULL) return -1;
    __ testptr(dst, dst);         // dst oop
    __ jccb(Assembler::zero, L_failed_0);

    //  if (dst_pos < 0) return -1;
    __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
    size_t j4off = __ offset();
    __ jccb(Assembler::negative, L_failed_0);

    // The first four tests are very dense code,
    // but not quite dense enough to put four
    // jumps in a 16-byte instruction fetch buffer.
    // That's good, because some branch predictors
    // do not like jumps so close together.
    // Make sure of this.
    guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

    // registers used as temp
    const Register r11_length    = r11; // elements count to copy
    const Register r10_src_klass = r10; // array klass

    //  if (length < 0) return -1;
    __ movl(r11_length, length);        // length (elements count, 32-bits value)
    __ testl(r11_length, r11_length);
    __ jccb(Assembler::negative, L_failed_0);

    __ load_klass(r10_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ testptr(r10_src_klass, r10_src_klass);
      __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rax, dst);
      __ cmpq(rax, 0);
      __ jcc(Assembler::equal, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

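    // Worked decode (illustrative; assumes a 16-byte array header, as
    // with compressed class pointers): for jint[] the helper is roughly
    //
    //   (0x3 << 30) | (16 << _lh_header_size_shift) | (T_INT << 8) | 2
    //
    // so shifting right by _lh_header_size_shift and masking with
    // _lh_header_size_mask recovers the 16-byte element base offset, and
    // the log2_element_size in the low bits (2) selects the int copy loop.
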
    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
    __ jcc(Assembler::equal, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rax, dst);
    __ cmpq(r10_src_klass, rax);
    __ jcc(Assembler::notEqual, L_failed);

    const Register rax_lh = rax;  // layout helper
    __ movl(rax_lh, Address(r10_src_klass, lh_offset));

    //  if (!src->is_Array()) return -1;
    __ cmpl(rax_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed);

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           r10, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register r10_offset = r10;    // array offset
    const Register rax_elsize = rax_lh; // element size

    __ movl(r10_offset, rax_lh);
    __ shrl(r10_offset, Klass::_lh_header_size_shift);
    __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
    __ addptr(src, r10_offset);           // src array offset
    __ addptr(dst, r10_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");
    __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    __ BIND(L_copy_bytes);
    __ cmpl(rax_elsize, 0);
    __ jccb(Assembler::notEqual, L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(byte_copy_entry));

    __ BIND(L_copy_shorts);
    __ cmpl(rax_elsize, LogBytesPerShort);
    __ jccb(Assembler::notEqual, L_copy_ints);
    __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_copy_ints);
    __ cmpl(rax_elsize, LogBytesPerInt);
    __ jccb(Assembler::notEqual, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ cmpl(rax_elsize, LogBytesPerLong);
      __ jcc(Assembler::equal, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
    __ BIND(L_objArray);
    // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ load_klass(rax, dst);
    __ cmpq(r10_src_klass, rax); // usual case is exact equality
    __ jcc(Assembler::notEqual, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           r10, L_failed);

    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ BIND(L_plain_copy);
    __ jump(RuntimeAddress(oop_copy_entry));

    __ BIND(L_checkcast_copy);
    // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ cmpl(Address(rax, lh_offset), objArray_lh);
      __ jcc(Assembler::notEqual, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                             rax, L_failed);

      const Register r11_dst_klass = r11;
      __ load_klass(r11_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, TIMES_OOP,
                   arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                   arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movl(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 r11_dst_klass, r10_src_klass);
      assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);
      generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
      __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      setup_arg_regs(4);
      __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
      __ jump(RuntimeAddress(checkcast_copy_entry));
    }

    __ BIND(L_failed);
    __ xorptr(rax, rax);
    __ notptr(rax); // return -1
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_data_cache_writeback() {
    const Register src        = c_rarg0;  // source address

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");

    address start = __ pc();
    __ enter();
    __ cache_wb(Address(src, 0));
    __ leave();
    __ ret(0);

    return start;
  }

  address generate_data_cache_writeback_sync() {
    const Register is_pre    = c_rarg0;  // pre or post sync

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");

    // pre wbsync is a no-op
    // post wbsync translates to an sfence

    Label skip;
    address start = __ pc();
    __ enter();
    __ cmpl(is_pre, 0);
    __ jcc(Assembler::notEqual, skip);
    __ cache_wbsync(false);
    __ bind(skip);
    __ leave();
    __ ret(0);

    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
                                                                           "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                                                           "jbyte_arraycopy");

    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
                                                                            "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
                                                                            "jshort_arraycopy");

    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
                                                                              "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
                                                                              &entry_jint_arraycopy, "jint_arraycopy");

    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
                                                                               "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
                                                                               &entry_jlong_arraycopy, "jlong_arraycopy");


    if (UseCompressedOops) {
      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                              "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
                                                                              &entry_oop_arraycopy, "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                     "oop_disjoint_arraycopy_uninit",
                                                                                     /*dest_uninitialized*/true);
      StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                     NULL, "oop_arraycopy_uninit",
                                                                                     /*dest_uninitialized*/true);
    } else {
      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                               "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
                                                                               &entry_oop_arraycopy, "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                      "oop_disjoint_arraycopy_uninit",
                                                                                      /*dest_uninitialized*/true);
      StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                      NULL, "oop_arraycopy_uninit",
                                                                                      /*dest_uninitialized*/true);
    }

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);
    StubRoutines::_generic_arraycopy  = generate_generic_copy("generic_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_oop_arraycopy,
                                                              entry_jlong_arraycopy,
                                                              entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

    // We don't generate specialized code for HeapWord-aligned source
    // arrays, so just use the code we've already generated
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
    StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

    StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
    StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

    StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
    StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
    return start;
  }

  address generate_counter_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }

  // Utility routine for loading a 128-bit key word in little endian format;
  // the shuffle mask may optionally be supplied already loaded in an XMM register.
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }

  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrq(reg, xmmdst, 0x0);
    __ addq(reg, inc_delta);
    __ pinsrq(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry
    __ pextrq(reg, xmmdst, 0x01); // Carry
    __ addq(reg, 0x01);
    __ pinsrq(xmmdst, reg, 0x01); // Carry end
    __ BIND(next_block);          // next instruction
  }
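
  // For reference, the pextrq/addq/pinsrq sequence above implements a
  // 128-bit add across two 64-bit xmm lanes. The portable model below is
  // a minimal sketch of that dataflow (illustrative only; it is never
  // emitted into a stub, and the names are ours, not HotSpot API):
  // lane 0 is incremented first, lane 1 absorbs the carry.
  static void inc_counter_model(unsigned long long& lo /* lane 0x0 */,
                                unsigned long long& hi /* lane 0x1 */,
                                unsigned long long inc_delta) {
    unsigned long long old_lo = lo;
    lo += inc_delta;                // __ addq(reg, inc_delta)
    if (lo < old_lo) {              // carry set -> fall through past the jcc
      hi += 1;                      // __ addq(reg, 0x01) on lane 0x01
    }
  }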

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
    const XMMRegister xmm_temp1 = xmm2;
    const XMMRegister xmm_temp2 = xmm3;
    const XMMRegister xmm_temp3 = xmm4;
    const XMMRegister xmm_temp4 = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result); // store the result
    __ xorptr(rax, rax);                   // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
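
  // The two keylen compares above (44, 52) are how the stub selects 10, 12
  // or 14 AES rounds: the Java-expanded key holds 4 * (rounds + 1) ints, so
  // the array length alone encodes the round count. A one-line restatement
  // of that relation (illustrative helper, not HotSpot API):
  static int aes_rounds_model(int keylen_ints) {
    // valid inputs are 44, 52, 60 -> 10, 12, 14 rounds
    return keylen_ints / 4 - 1;
  }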


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
    const XMMRegister xmm_temp1 = xmm2;
    const XMMRegister xmm_temp2 = xmm3;
    const XMMRegister xmm_temp3 = xmm4;
    const XMMRegister xmm_temp4 = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // for decryption the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // for decryption the aesdeclast operation is always on key+0x00
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result); // store the result
    __ xorptr(rax, rax);                   // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from = c_rarg0; // source array address
    const Register to   = c_rarg1; // destination array address
    const Register key  = c_rarg2; // key array address
    const Register rvec = c_rarg3; // r byte array initialized from initvector array address
                                   // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Register len_reg = r11;             // pick the volatile windows register
#endif
    const Register pos = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
    const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
    const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
    const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp); // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0); // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp); // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0); // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key12);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp); // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0); // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
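
  // Dataflow model of the three L_loopTop loops above (a sketch, not a real
  // cipher): each 16-byte block is XORed with the running r vector, run
  // through the AES rounds, and the ciphertext becomes the next r vector --
  // which is why CBC encryption cannot be parallelized across blocks.
  // encrypt_block_model stands in for the aesenc/aesenclast sequence and is
  // a placeholder of ours, not HotSpot API.
  static void encrypt_block_model(unsigned char block[16], const unsigned char* key) {
    for (int i = 0; i < 16; i++) block[i] ^= key[i]; // placeholder rounds
  }
  static void cbc_encrypt_model(const unsigned char* from, unsigned char* to,
                                int len, const unsigned char* key,
                                unsigned char rvec[16]) {
    for (int pos = 0; pos < len; pos += 16) {
      unsigned char blk[16];
      for (int i = 0; i < 16; i++) {
        blk[i] = from[pos + i] ^ rvec[i]; // xor with the current r vector
      }
      encrypt_block_model(blk, key);      // the aes rounds
      for (int i = 0; i < 16; i++) {
        to[pos + i] = blk[i];             // store ciphertext
        rvec[i]     = blk[i];             // ciphertext chains forward
      }
    }
    // on exit rvec holds the final r, as stored back at L_exit
  }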

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   rax = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ movl(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ movq(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ movq(rax, c_rarg1);
    __ ret(0);
  }
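
  // Behavioral model of the stub above (illustrative only; the real safety
  // comes from the VM signal handler, which resumes a faulting load at
  // *fault_pc from *continuation_pc with errValue still intact -- that
  // mechanism cannot be expressed in portable code, so a flag stands in
  // for "the load did not fault"):
  static int safe_fetch32_model(const int* adr, int err_value, bool adr_is_mapped) {
    int result = err_value; // c_rarg1 arrives holding errValue
    if (adr_is_mapped) {    // stands in for "the movl did not fault"
      result = *adr;        // __ movl(c_rarg1, Address(c_rarg0, 0))
    }
    return result;          // __ movq(rax, c_rarg1)
  }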

  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from = c_rarg0; // source array address
    const Register to   = c_rarg1; // destination array address
    const Register key  = c_rarg2; // key array address
    const Register rvec = c_rarg3; // r byte array initialized from initvector array address
                                   // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Register len_reg = r11;             // pick the volatile windows register
#endif
    const Register pos = rax;

    const int PARALLEL_FACTOR = 4;
    const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256

    Label L_exit;
    Label L_singleBlock_loopTopHead[3];  // 128, 192, 256
    Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
    Label L_singleBlock_loopTop[3];      // 128, 192, 256
    Label L_multiBlock_loopTopHead[3];   // 128, 192, 256
    Label L_multiBlock_loopTop[3];       // 128, 192, 256

    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif
    __ push(rbx);
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

    const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec

    __ xorptr(pos, pos);

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rbx, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
    __ cmpl(rbx, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);

#define DoFour(opc, src_reg)       \
  __ opc(xmm_result0, src_reg);    \
  __ opc(xmm_result1, src_reg);    \
  __ opc(xmm_result2, src_reg);    \
  __ opc(xmm_result3, src_reg);

    for (int k = 0; k < 3; ++k) {
      __ BIND(L_multiBlock_loopTopHead[k]);
      if (k != 0) {
        __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
        __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
      }
      if (k == 1) {
        __ subptr(rsp, 6 * wordSize);
        __ movdqu(Address(rsp, 0), xmm15); // save last_key from xmm15
        load_key(xmm15, key, 0xb0);        // 0xb0; 192-bit key goes up to 0xc0
        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
        load_key(xmm1, key, 0xc0);         // 0xc0;
        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
      } else if (k == 2) {
        __ subptr(rsp, 10 * wordSize);
        __ movdqu(Address(rsp, 0), xmm15); // save last_key from xmm15
        load_key(xmm15, key, 0xd0);        // 0xd0; 256-bit key goes up to 0xe0
        __ movdqu(Address(rsp, 6 * wordSize), xmm15);
        load_key(xmm1, key, 0xe0);         // 0xe0;
        __ movdqu(Address(rsp, 8 * wordSize), xmm1);
        load_key(xmm15, key, 0xb0);        // 0xb0;
        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
        load_key(xmm1, key, 0xc0);         // 0xc0;
        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
      }
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
      __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);

      if (k != 0) {
        __ movdqu(xmm15, Address(rsp, 2 * wordSize));
        __ movdqu(xmm1, Address(rsp, 4 * wordSize));
      }

      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));

      DoFour(pxor, xmm_key_first);
      if (k == 0) {
        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        DoFour(aesdeclast, xmm_key_last);
      } else if (k == 1) {
        for (int rnum = 1; rnum <= ROUNDS[k] - 2; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
        DoFour(aesdec, xmm1); // key : 0xc0
        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
        DoFour(aesdeclast, xmm_key_last);
      } else if (k == 2) {
        for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        DoFour(aesdec, xmm1); // key : 0xc0
        __ movdqu(xmm15, Address(rsp, 6 * wordSize));
        __ movdqu(xmm1, Address(rsp, 8 * wordSize));
        DoFour(aesdec, xmm15); // key : 0xd0
        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
        DoFour(aesdec, xmm1); // key : 0xe0
        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
        DoFour(aesdeclast, xmm_key_last);
      }

      // for each result, xor with the r vector of previous cipher block
      __ pxor(xmm_result0, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result1, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ pxor(xmm_result2, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ pxor(xmm_result3, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
      if (k != 0) {
        __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
      }

      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
      __ jmp(L_multiBlock_loopTop[k]);

      // registers used in the non-parallelized loops
      // xmm register assignments for the loops below
      const XMMRegister xmm_result = xmm0;
      const XMMRegister xmm_prev_block_cipher_save = xmm2;
      const XMMRegister xmm_key11 = xmm3;
      const XMMRegister xmm_key12 = xmm4;
      const XMMRegister key_tmp   = xmm4;

      __ BIND(L_singleBlock_loopTopHead[k]);
      if (k == 1) {
        __ addptr(rsp, 6 * wordSize);
      } else if (k == 2) {
        __ addptr(rsp, 10 * wordSize);
      }
      __ cmpptr(len_reg, 0); // any blocks left?
      __ jcc(Assembler::equal, L_exit);
      __ BIND(L_singleBlock_loopTopHead2[k]);
      if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
      }
      if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
      }
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlock_loopTop[k]);
      __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
      __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
      __ pxor(xmm_result, xmm_key_first);                             // do the aes dec rounds
      for (int rnum = 1; rnum <= 9; rnum++) {
        __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
      }
      if (k == 1) {
        __ aesdec(xmm_result, xmm_key11);
        __ aesdec(xmm_result, xmm_key12);
      }
      if (k == 2) {
        __ aesdec(xmm_result, xmm_key11);
        load_key(key_tmp, key, 0xc0);
        __ aesdec(xmm_result, key_tmp);
        load_key(key_tmp, key, 0xd0);
        __ aesdec(xmm_result, key_tmp);
        load_key(key_tmp, key, 0xe0);
        __ aesdec(xmm_result, key_tmp);
      }

      __ aesdeclast(xmm_result, xmm_key_last);    // xmm15 always came from key+0
      __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
      if (k != 2) {
        __ jmp(L_exit);
      }
    } // for 128/192/256

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
    __ pop(rbx);
#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
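
  // Why DoFour is legal: CBC decryption is p[i] = D(c[i]) ^ c[i-1], so every
  // output depends only on *inputs*, never on a previously produced output.
  // The four decryptions below are therefore independent and can overlap,
  // which is exactly the latency-hiding the multi-block loop exploits. A
  // sketch with a placeholder cipher (illustrative only, not HotSpot API):
  static void decrypt_block_model(unsigned char block[16], const unsigned char* key) {
    for (int i = 0; i < 16; i++) block[i] ^= key[i]; // placeholder rounds
  }
  static void cbc_decrypt4_model(const unsigned char* from, unsigned char* to,
                                 const unsigned char* key,
                                 unsigned char prev_cipher[16]) {
    unsigned char res[4][16];
    for (int b = 0; b < 4; b++) { // independent decryptions
      for (int i = 0; i < 16; i++) res[b][i] = from[16 * b + i];
      decrypt_block_model(res[b], key);
    }
    for (int b = 0; b < 4; b++) { // xor each with the prior ciphertext block
      const unsigned char* chain = (b == 0) ? prev_cipher : from + 16 * (b - 1);
      for (int i = 0; i < 16; i++) to[16 * b + i] = res[b][i] ^ chain[i];
    }
    for (int i = 0; i < 16; i++) prev_cipher[i] = from[48 + i]; // carries over
  }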

  address generate_electronicCodeBook_encryptAESCrypt() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
    address start = __ pc();
    const Register from = c_rarg0; // source array address
    const Register to   = c_rarg1; // destination array address
    const Register key  = c_rarg2; // key array address
    const Register len  = c_rarg3; // src len (must be multiple of blocksize 16)
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ aesecb_encrypt(from, to, key, len);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_electronicCodeBook_decryptAESCrypt() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
    address start = __ pc();
    const Register from = c_rarg0; // source array address
    const Register to   = c_rarg1; // destination array address
    const Register key  = c_rarg2; // key array address
    const Register len  = c_rarg3; // src len (must be multiple of blocksize 16)
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ aesecb_decrypt(from, to, key, len);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_upper_word_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
    address start = __ pc();
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
    return start;
  }

  address generate_shuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }

  // ofs and limit are used for multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister abcd = xmm0;
    const XMMRegister e0   = xmm1;
    const XMMRegister e1   = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
                 buf, state, ofs, limit, rsp, multi_block);

    __ addptr(rsp, 4 * wordSize);

    __ leave();
    __ ret(0);
    return start;
  }
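
  // Shape of the multi_block contract (a sketch under the assumption that
  // compress_block_model stands in for the SHA-1 rounds on one 64-byte
  // chunk; the names are ours, not HotSpot API). When multi_block is true
  // the stub keeps consuming blocks while ofs <= limit and returns the
  // updated offset, matching DigestBase.implCompressMultiBlock's int result:
  static void compress_block_model(unsigned int state[5], const unsigned char* block) {
    for (int i = 0; i < 64; i++) state[i % 5] += block[i]; // placeholder rounds
  }
  static int impl_compress_multi_block_model(const unsigned char* buf,
                                             unsigned int state[5],
                                             int ofs, int limit) {
    do {
      compress_block_model(state, buf + ofs);
      ofs += 64; // SHA-1 block size in bytes
    } while (ofs <= limit);
    return ofs;
  }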

  address generate_pshuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x0405060700010203, relocInfo::none);
    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);

    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
      __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
      // _SHUF_00BA
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      // _SHUF_DC00
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
    }

    return start;
  }

  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
  address generate_pshuffle_byte_flip_mask_sha512() {
    __ align(32);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
    address start = __ pc();
    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
      __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
      __ emit_data64(0x1011121314151617, relocInfo::none);
      __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
      __ emit_data64(0x0000000000000000, relocInfo::none); // MASK_YMM_LO
      __ emit_data64(0x0000000000000000, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
    }

    return start;
  }

  // ofs and limit are used for multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg     = xmm0;
    const XMMRegister state0  = xmm1;
    const XMMRegister state1  = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    if (VM_Version::supports_sha()) {
      __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    } else if (VM_Version::supports_avx2()) {
      __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    }
    __ addptr(rsp, 4 * wordSize);
    __ vzeroupper();
    __ leave();
    __ ret(0);
    return start;
  }

  address generate_sha512_implCompress(bool multi_block, const char *name) {
    assert(VM_Version::supports_avx2(), "");
    assert(VM_Version::supports_bmi2(), "");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg     = xmm0;
    const XMMRegister state0  = xmm1;
    const XMMRegister state1  = xmm2;
    const XMMRegister msgtmp0 = xmm3;
    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();

    __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                   buf, state, ofs, limit, rsp, multi_block, shuf_mask);

    __ vzeroupper();
    __ leave();
    __ ret(0);
    return start;
  }

  // This mask is used for incrementing the counter value (linc0, linc4, etc.)
  address counter_mask_addr() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); // lbswapmask
    __ emit_data64(0x0001020304050607, relocInfo::none);
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none); // linc0 = counter_mask_addr() + 64
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none); // counter_mask_addr() + 80
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none); // linc4 = counter_mask_addr() + 128
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000008, relocInfo::none); // linc8 = counter_mask_addr() + 192
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000008, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000008, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000008, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000020, relocInfo::none); // linc32 = counter_mask_addr() + 256
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000020, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000020, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000020, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000010, relocInfo::none); // linc16 = counter_mask_addr() + 320
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000010, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000010, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000010, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    return start;
  }

  // Vector AES Counter implementation
  address generate_counterMode_VectorAESCrypt() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    address start = __ pc();
    const Register from    = c_rarg0; // source array address
    const Register to      = c_rarg1; // destination array address
    const Register key     = c_rarg2; // key array address (r8)
    const Register counter = c_rarg3; // counter byte array initialized from counter array address
                                      // and updated with the incremented counter in the end
#ifndef _WIN64
    const Register len_reg = c_rarg4;
    const Register saved_encCounter_start = c_rarg5;
    const Register used_addr = r10;
    const Address  used_mem(rbp, 2 * wordSize);
    const Register used = r11;
#else
    const Address len_mem(rbp, 6 * wordSize);              // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize);             // used length is on stack on Win64
    const Register len_reg = r10; // pick the first volatile windows register
    const Register saved_encCounter_start = r11;
    const Register used_addr = r13;
    const Register used = r14;
#endif
    __ enter();
    // Save state before entering routine
    __ push(r12);
    __ push(r13);
    __ push(r14);
    __ push(r15);
#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg); // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif
    __ push(rbx);
    __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
    // Restore state before leaving routine
    __ pop(rbx);
#ifdef _WIN64
    __ movl(rax, len_mem); // return length
#else
    __ pop(rax); // return length
#endif
    __ pop(r15);
    __ pop(r14);
    __ pop(r13);
    __ pop(r12);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   Linux
  //     c_rarg4            - input length
  //     c_rarg5            - saved encryptedCounter start
  //     rbp + 6 * wordSize - saved used length
  //   Windows
  //     rbp + 6 * wordSize - input length
  //     rbp + 7 * wordSize - saved encryptedCounter start
  //     rbp + 8 * wordSize - saved used length
  //
  // Output:
  //   rax       - input length
  //
  address generate_counterMode_AESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    address start = __ pc();
    const Register from    = c_rarg0; // source array address
    const Register to      = c_rarg1; // destination array address
    const Register key     = c_rarg2; // key array address
    const Register counter = c_rarg3; // counter byte array initialized from counter array address
                                      // and updated with the incremented counter in the end
#ifndef _WIN64
    const Register len_reg = c_rarg4;
    const Register saved_encCounter_start = c_rarg5;
    const Register used_addr = r10;
    const Address  used_mem(rbp, 2 * wordSize);
    const Register used = r11;
#else
    const Address len_mem(rbp, 6 * wordSize);              // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize);             // used length is on stack on Win64
    const Register len_reg = r10; // pick the first volatile windows register
    const Register saved_encCounter_start = r11;
    const Register used_addr = r13;
    const Register used = r14;
#endif
    const Register pos = rax;

    const int PARALLEL_FACTOR = 6;
    const XMMRegister xmm_counter_shuf_mask = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
    const XMMRegister xmm_curr_counter = xmm2;

    const XMMRegister xmm_key_tmp0 = xmm3;
    const XMMRegister xmm_key_tmp1 = xmm4;

    // registers holding the six results in the parallelized loop
    const XMMRegister xmm_result0 = xmm5;
    const XMMRegister xmm_result1 = xmm6;
    const XMMRegister xmm_result2 = xmm7;
    const XMMRegister xmm_result3 = xmm8;
    const XMMRegister xmm_result4 = xmm9;
    const XMMRegister xmm_result5 = xmm10;

    const XMMRegister xmm_from0 = xmm11;
    const XMMRegister xmm_from1 = xmm12;
    const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3~4: xmm_key_tmp0/1 are no longer needed when loading input text
    const XMMRegister xmm_from5 = xmm4;

    // for key_128, key_192, key_256
    const int rounds[3] = {10, 12, 14};
    Label L_exit_preLoop, L_preLoop_start;
    Label L_multiBlock_loopTop[3];
    Label L_singleBlockLoopTop[3];
    Label L__incCounter[3][6];     // for 6 blocks
    Label L__incCounter_single[3]; // for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

    Label L_exit;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // allocate spill slots for r13, r14
    enum {
      saved_r13_offset,
      saved_r14_offset
    };
    __ subptr(rsp, 2 * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);

    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg); // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif

    __ push(rbx); // Save RBX
    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled
    __ movptr(pos, 0);

    // Use the partially used encrypted counter from the last invocation
    __ BIND(L_preLoop_start);
    __ cmpptr(used, 16);
    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
    __ cmpptr(len_reg, 0);
    __ jcc(Assembler::lessEqual, L_exit_preLoop);
    __ movb(rbx, Address(saved_encCounter_start, used));
    __ xorb(rbx, Address(from, pos));
    __ movb(Address(to, pos), rbx);
    __ addptr(pos, 1);
    __ addptr(used, 1);
    __ subptr(len_reg, 1);

    __ jmp(L_preLoop_start);

    __ BIND(L_exit_preLoop);
    __ movl(Address(used_addr, 0), used);

    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rbx, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rbx, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);

#define CTR_DoSix(opc, src_reg)    \
  __ opc(xmm_result0, src_reg);    \
  __ opc(xmm_result1, src_reg);    \
  __ opc(xmm_result2, src_reg);    \
  __ opc(xmm_result3, src_reg);    \
  __ opc(xmm_result4, src_reg);    \
  __ opc(xmm_result5, src_reg);

    // k == 0 : generate code for key_128
    // k == 1 : generate code for key_192
    // k == 2 : generate code for key_256
    for (int k = 0; k < 3; ++k) {
      // multi-block code starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);

      // load, then increase counters
      CTR_DoSix(movdqa, xmm_curr_counter);
      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
      inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle counters back for PXOR
      CTR_DoSix(pxor, xmm_key_tmp0);            // PXOR with Round 0 key

      // load two ROUND_KEYs at a time
      for (int i = 1; i < rounds[k]; ) {
        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
        CTR_DoSix(aesenc, xmm_key_tmp1);
        i++;
        if (i != rounds[k]) {
          CTR_DoSix(aesenc, xmm_key_tmp0);
        } else {
          CTR_DoSix(aesenclast, xmm_key_tmp0);
        }
        i++;
      }

      // get next PARALLEL_FACTOR blocks into xmm_result registers
      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));

      __ pxor(xmm_result0, xmm_from0);
      __ pxor(xmm_result1, xmm_from1);
      __ pxor(xmm_result2, xmm_from2);
      __ pxor(xmm_result3, xmm_from3);
      __ pxor(xmm_result4, xmm_from4);
      __ pxor(xmm_result5, xmm_from5);

      // store 6 results into the next 96 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);    // increase the length of crypt text
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
      __ jmp(L_multiBlock_loopTop[k]);

      // singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlockLoopTop[k]);
      __ cmpptr(len_reg, 0);
      __ jcc(Assembler::lessEqual, L_exit);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
      __ movdqa(xmm_result0, xmm_curr_counter);
      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
      __ pxor(xmm_result0, xmm_key_tmp0);
      for (int i = 1; i < rounds[k]; i++) {
        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
        __ aesenc(xmm_result0, xmm_key_tmp0);
      }
      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
      __ aesenclast(xmm_result0, xmm_key_tmp0);
      __ cmpptr(len_reg, AESBlockSize);
      __ jcc(Assembler::less, L_processTail_insr[k]);
      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result0, xmm_from0);
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jmp(L_singleBlockLoopTop[k]);
      __ BIND(L_processTail_insr[k]); // Process the tail part of the input array
      __ addptr(pos, len_reg);        // 1. Insert bytes from src array into xmm_from0 register
      __ testptr(len_reg, 8);
      __ jcc(Assembler::zero, L_processTail_4_insr[k]);
      __ subptr(pos, 8);
      __ pinsrq(xmm_from0, Address(from, pos), 0);
      __ BIND(L_processTail_4_insr[k]);
      __ testptr(len_reg, 4);
      __ jcc(Assembler::zero, L_processTail_2_insr[k]);
      __ subptr(pos, 4);
      __ pslldq(xmm_from0, 4);
      __ pinsrd(xmm_from0, Address(from, pos), 0);
      __ BIND(L_processTail_2_insr[k]);
      __ testptr(len_reg, 2);
      __ jcc(Assembler::zero, L_processTail_1_insr[k]);
      __ subptr(pos, 2);
      __ pslldq(xmm_from0, 2);
      __ pinsrw(xmm_from0, Address(from, pos), 0);
      __ BIND(L_processTail_1_insr[k]);
      __ testptr(len_reg, 1);
      __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
      __ subptr(pos, 1);
      __ pslldq(xmm_from0, 1);
      __ pinsrb(xmm_from0, Address(from, pos), 0);
      __ BIND(L_processTail_exit_insr[k]);

      __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext bytes.
      __ pxor(xmm_result0, xmm_from0);                            //    The encrypted counter is also saved for the next invocation.

      __ testptr(len_reg, 8);
      __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
      __ pextrq(Address(to, pos), xmm_result0, 0);
      __ psrldq(xmm_result0, 8);
      __ addptr(pos, 8);
      __ BIND(L_processTail_4_extr[k]);
      __ testptr(len_reg, 4);
      __ jcc(Assembler::zero, L_processTail_2_extr[k]);
      __ pextrd(Address(to, pos), xmm_result0, 0);
      __ psrldq(xmm_result0, 4);
      __ addptr(pos, 4);
      __ BIND(L_processTail_2_extr[k]);
      __ testptr(len_reg, 2);
      __ jcc(Assembler::zero, L_processTail_1_extr[k]);
      __ pextrw(Address(to, pos), xmm_result0, 0);
      __ psrldq(xmm_result0, 2);
      __ addptr(pos, 2);
      __ BIND(L_processTail_1_extr[k]);
      __ testptr(len_reg, 1);
      __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
      __ pextrb(Address(to, pos), xmm_result0, 0);

      __ BIND(L_processTail_exit_extr[k]);
      __ movl(Address(used_addr, 0), len_reg);
      __ jmp(L_exit);
    }

    __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled back.
    __ movdqu(Address(counter, 0), xmm_curr_counter);   // save counter back
    __ pop(rbx); // pop the saved RBX.
#ifdef _WIN64
    __ movl(rax, len_mem);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    __ addptr(rsp, 2 * wordSize);
#else
    __ pop(rax); // return 'len'
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
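
  // Model of the L_preLoop_start loop above (illustrative only, not HotSpot
  // API): 'used' counts how many bytes of the previously encrypted counter
  // block (saved_encCounter) were consumed by the last invocation. Leftover
  // keystream is used up byte-by-byte before any full 16-byte blocks are
  // processed.
  static int ctr_consume_saved_keystream_model(const unsigned char* from,
                                               unsigned char* to, int len,
                                               const unsigned char saved_enc_counter[16],
                                               int& used) {
    int pos = 0;
    while (used < 16 && pos < len) { // both exit conditions mirror L_exit_preLoop
      to[pos] = from[pos] ^ saved_enc_counter[used];
      pos++;
      used++;
    }
    return pos; // bytes already handled; the block loops start here
  }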

  void roundDec(XMMRegister xmm_reg) {
    __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
  }

  void roundDeclast(XMMRegister xmm_reg) {
    __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
  }

  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
    __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
  }
4446
generate_cipherBlockChaining_decryptVectorAESCrypt()4447 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4448 assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support");
4449 __ align(CodeEntryAlignment);
4450 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4451 address start = __ pc();
4452
4453 const Register from = c_rarg0; // source array address
4454 const Register to = c_rarg1; // destination array address
4455 const Register key = c_rarg2; // key array address
4456 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
4457 // and left with the results of the last encryption block
4458 #ifndef _WIN64
4459 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
4460 #else
4461 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4462 const Register len_reg = r11; // pick the volatile windows register
4463 #endif
4464
4465 Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4466 Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4467
4468 __ enter();
4469
4470 #ifdef _WIN64
4471 // on win64, fill len_reg from stack position
4472 __ movl(len_reg, len_mem);
4473 #else
4474 __ push(len_reg); // Save
4475 #endif
4476 __ push(rbx);
4477 __ vzeroupper();
4478
4479 // Temporary variable declaration for swapping key bytes
4480 const XMMRegister xmm_key_shuf_mask = xmm1;
4481 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4482
4483 // Load the expanded key length (in ints): 44, 52, or 60 for 10, 12, or 14 rounds
4484 const Register rounds = rbx;
4485 __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4486
4487 const XMMRegister IV = xmm0;
4488 // Load IV and broadcast value to 512-bits
4489 __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4490
4491 // Temporary variables for storing round keys
4492 const XMMRegister RK0 = xmm30;
4493 const XMMRegister RK1 = xmm9;
4494 const XMMRegister RK2 = xmm18;
4495 const XMMRegister RK3 = xmm19;
4496 const XMMRegister RK4 = xmm20;
4497 const XMMRegister RK5 = xmm21;
4498 const XMMRegister RK6 = xmm22;
4499 const XMMRegister RK7 = xmm23;
4500 const XMMRegister RK8 = xmm24;
4501 const XMMRegister RK9 = xmm25;
4502 const XMMRegister RK10 = xmm26;
4503
4504 // Load and shuffle key
4505 // the Java expanded key ordering is rotated one position from what we want,
4506 // so we start from 1*16 here and hit 0*16 last
4507 ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4508 ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4509 ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4510 ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4511 ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4512 ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4513 ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4514 ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4515 ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4516 ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4517 ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4518
4519 // Variables for storing source cipher text
4520 const XMMRegister S0 = xmm10;
4521 const XMMRegister S1 = xmm11;
4522 const XMMRegister S2 = xmm12;
4523 const XMMRegister S3 = xmm13;
4524 const XMMRegister S4 = xmm14;
4525 const XMMRegister S5 = xmm15;
4526 const XMMRegister S6 = xmm16;
4527 const XMMRegister S7 = xmm17;
4528
4529 // Variables for storing decrypted text
4530 const XMMRegister B0 = xmm1;
4531 const XMMRegister B1 = xmm2;
4532 const XMMRegister B2 = xmm3;
4533 const XMMRegister B3 = xmm4;
4534 const XMMRegister B4 = xmm5;
4535 const XMMRegister B5 = xmm6;
4536 const XMMRegister B6 = xmm7;
4537 const XMMRegister B7 = xmm8;
4538
4539 __ cmpl(rounds, 44);
4540 __ jcc(Assembler::greater, KEY_192);
4541 __ jmp(Loop);
4542
4543 __ BIND(KEY_192);
4544 const XMMRegister RK11 = xmm27;
4545 const XMMRegister RK12 = xmm28;
4546 ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4547 ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4548
4549 __ cmpl(rounds, 52);
4550 __ jcc(Assembler::greater, KEY_256);
4551 __ jmp(Loop);
4552
4553 __ BIND(KEY_256);
4554 const XMMRegister RK13 = xmm29;
4555 const XMMRegister RK14 = xmm31;
4556 ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4557 ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4558
4559 __ BIND(Loop);
4560 __ cmpl(len_reg, 512);
4561 __ jcc(Assembler::below, Lcbc_dec_rem);
4562 __ BIND(Loop1);
4563 __ subl(len_reg, 512);
4564 __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4565 __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4566 __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4567 __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4568 __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4569 __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4570 __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4571 __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4572 __ leaq(from, Address(from, 8 * 64));
4573
4574 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4575 __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4576 __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4577 __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4578 __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4579 __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4580 __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4581 __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4582
4583 __ evalignq(IV, S0, IV, 0x06);
4584 __ evalignq(S0, S1, S0, 0x06);
4585 __ evalignq(S1, S2, S1, 0x06);
4586 __ evalignq(S2, S3, S2, 0x06);
4587 __ evalignq(S3, S4, S3, 0x06);
4588 __ evalignq(S4, S5, S4, 0x06);
4589 __ evalignq(S5, S6, S5, 0x06);
4590 __ evalignq(S6, S7, S6, 0x06);
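// The evalignq shifts build the "previous ciphertext" vectors: each result
// holds the last 16-byte block of one source register followed by the first
// three blocks of the next, i.e. the ciphertext stream offset by one block.
// These vectors are XORed into the decrypted blocks in Loop2 to complete the
// CBC recurrence.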
4591
4592 roundDec(RK2);
4593 roundDec(RK3);
4594 roundDec(RK4);
4595 roundDec(RK5);
4596 roundDec(RK6);
4597 roundDec(RK7);
4598 roundDec(RK8);
4599 roundDec(RK9);
4600 roundDec(RK10);
4601
4602 __ cmpl(rounds, 44);
4603 __ jcc(Assembler::belowEqual, L_128);
4604 roundDec(RK11);
4605 roundDec(RK12);
4606
4607 __ cmpl(rounds, 52);
4608 __ jcc(Assembler::belowEqual, L_192);
4609 roundDec(RK13);
4610 roundDec(RK14);
4611
4612 __ BIND(L_256);
4613 roundDeclast(RK0);
4614 __ jmp(Loop2);
4615
4616 __ BIND(L_128);
4617 roundDeclast(RK0);
4618 __ jmp(Loop2);
4619
4620 __ BIND(L_192);
4621 roundDeclast(RK0);
4622
4623 __ BIND(Loop2);
4624 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4625 __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4626 __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4627 __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4628 __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4629 __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4630 __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4631 __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4632 __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4633
4634 __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4635 __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4636 __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4637 __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4638 __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4639 __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4640 __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4641 __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4642 __ leaq(to, Address(to, 8 * 64));
4643 __ jmp(Loop);
4644
4645 __ BIND(Lcbc_dec_rem);
4646 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
4647
4648 __ BIND(Lcbc_dec_rem_loop);
4649 __ subl(len_reg, 16);
4650 __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4651
4652 __ movdqu(S0, Address(from, 0));
4653 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4654 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4655 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4656 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4657 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4658 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4659 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4660 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4661 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4662 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4663 __ cmpl(rounds, 44);
4664 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4665
4666 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4667 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4668 __ cmpl(rounds, 52);
4669 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4670
4671 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4672 __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4673
4674 __ BIND(Lcbc_dec_rem_last);
4675 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4676
4677 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4678 __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4679 __ movdqu(Address(to, 0), B0);
4680 __ leaq(from, Address(from, 16));
4681 __ leaq(to, Address(to, 16));
4682 __ jmp(Lcbc_dec_rem_loop);
4683
4684 __ BIND(Lcbc_dec_ret);
4685 __ movdqu(Address(rvec, 0), IV);
4686
4687 // Zero out the round keys
4688 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4689 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4690 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4691 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4692 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4693 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4694 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4695 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4696 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4697 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4698 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4699 __ cmpl(rounds, 44);
4700 __ jcc(Assembler::belowEqual, Lcbc_exit);
4701 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4702 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4703 __ cmpl(rounds, 52);
4704 __ jcc(Assembler::belowEqual, Lcbc_exit);
4705 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4706 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4707
4708 __ BIND(Lcbc_exit);
4709 __ pop(rbx);
4710 #ifdef _WIN64
4711 __ movl(rax, len_mem);
4712 #else
4713 __ pop(rax); // return length
4714 #endif
4715 __ leave(); // required for proper stackwalking of RuntimeStub frame
4716 __ ret(0);
4717 return start;
4718 }
4719
4720 // Polynomial x^128+x^127+x^126+x^121+1
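// (stored in the bit-reflected form used by the carry-less-multiply based
// reduction; see Intel's carry-less multiplication white paper)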
4721 address ghash_polynomial_addr() {
4722 __ align(CodeEntryAlignment);
4723 StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
4724 address start = __ pc();
4725 __ emit_data64(0x0000000000000001, relocInfo::none);
4726 __ emit_data64(0xc200000000000000, relocInfo::none);
4727 return start;
4728 }
4729
4730 address ghash_shufflemask_addr() {
4731 __ align(CodeEntryAlignment);
4732 StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
4733 address start = __ pc();
4734 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4735 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4736 return start;
4737 }
4738
4739 // Ghash single and multi block operations using AVX instructions
4740 address generate_avx_ghash_processBlocks() {
4741 __ align(CodeEntryAlignment);
4742
4743 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4744 address start = __ pc();
4745
4746 // arguments
4747 const Register state = c_rarg0;
4748 const Register htbl = c_rarg1;
4749 const Register data = c_rarg2;
4750 const Register blocks = c_rarg3;
4751 __ enter();
4752 // Save state before entering routine
4753 __ avx_ghash(state, htbl, data, blocks);
4754 __ leave(); // required for proper stackwalking of RuntimeStub frame
4755 __ ret(0);
4756 return start;
4757 }
4758
4759 // byte swap x86 long
4760 address generate_ghash_long_swap_mask() {
4761 __ align(CodeEntryAlignment);
4762 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4763 address start = __ pc();
4764 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4765 __ emit_data64(0x0706050403020100, relocInfo::none );
4766 return start;
4767 }
4768
4769 // byte swap x86 byte array
4770 address generate_ghash_byte_swap_mask() {
4771 __ align(CodeEntryAlignment);
4772 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4773 address start = __ pc();
4774 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4775 __ emit_data64(0x0001020304050607, relocInfo::none );
4776 return start;
4777 }
4778
4779 /* Single and multi-block ghash operations */
4780 address generate_ghash_processBlocks() {
4781 __ align(CodeEntryAlignment);
4782 Label L_ghash_loop, L_exit;
4783 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4784 address start = __ pc();
4785
4786 const Register state = c_rarg0;
4787 const Register subkeyH = c_rarg1;
4788 const Register data = c_rarg2;
4789 const Register blocks = c_rarg3;
4790
4791 const XMMRegister xmm_temp0 = xmm0;
4792 const XMMRegister xmm_temp1 = xmm1;
4793 const XMMRegister xmm_temp2 = xmm2;
4794 const XMMRegister xmm_temp3 = xmm3;
4795 const XMMRegister xmm_temp4 = xmm4;
4796 const XMMRegister xmm_temp5 = xmm5;
4797 const XMMRegister xmm_temp6 = xmm6;
4798 const XMMRegister xmm_temp7 = xmm7;
4799 const XMMRegister xmm_temp8 = xmm8;
4800 const XMMRegister xmm_temp9 = xmm9;
4801 const XMMRegister xmm_temp10 = xmm10;
4802
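// GHASH folds each 16-byte block D into the running state S as
//   S = (S ^ D) * H   in GF(2^128) modulo x^128 + x^127 + x^126 + x^121 + 1,
// where H is the hash subkey. The loop below forms the full carry-less
// 128x128-bit product with four PCLMULQDQ instructions (schoolbook
// multiplication on 64-bit halves) and then reduces it modulo the polynomial
// in two phases.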
4803 __ enter();
4804
4805 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4806
4807 __ movdqu(xmm_temp0, Address(state, 0));
4808 __ pshufb(xmm_temp0, xmm_temp10);
4809
4810
4811 __ BIND(L_ghash_loop);
4812 __ movdqu(xmm_temp2, Address(data, 0));
4813 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4814
4815 __ movdqu(xmm_temp1, Address(subkeyH, 0));
4816 __ pshufb(xmm_temp1, xmm_temp10);
4817
4818 __ pxor(xmm_temp0, xmm_temp2);
4819
4820 //
4821 // Multiply with the hash key
4822 //
4823 __ movdqu(xmm_temp3, xmm_temp0);
4824 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
4825 __ movdqu(xmm_temp4, xmm_temp0);
4826 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
4827
4828 __ movdqu(xmm_temp5, xmm_temp0);
4829 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
4830 __ movdqu(xmm_temp6, xmm_temp0);
4831 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
4832
4833 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
4834
4835 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
4836 __ psrldq(xmm_temp4, 8); // shift xmm4 by 64 bits to the right
4837 __ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left
4838 __ pxor(xmm_temp3, xmm_temp5);
4839 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
4840 // of the carry-less multiplication of
4841 // xmm0 by xmm1.
4842
4843 // We shift the result of the multiplication by one bit position
4844 // to the left to compensate for the fact that the bits are reversed.
4845 __ movdqu(xmm_temp7, xmm_temp3);
4846 __ movdqu(xmm_temp8, xmm_temp6);
4847 __ pslld(xmm_temp3, 1);
4848 __ pslld(xmm_temp6, 1);
4849 __ psrld(xmm_temp7, 31);
4850 __ psrld(xmm_temp8, 31);
4851 __ movdqu(xmm_temp9, xmm_temp7);
4852 __ pslldq(xmm_temp8, 4);
4853 __ pslldq(xmm_temp7, 4);
4854 __ psrldq(xmm_temp9, 12);
4855 __ por(xmm_temp3, xmm_temp7);
4856 __ por(xmm_temp6, xmm_temp8);
4857 __ por(xmm_temp6, xmm_temp9);
4858
4859 //
4860 // First phase of the reduction
4861 //
4862 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4863 // independently.
4864 __ movdqu(xmm_temp7, xmm_temp3);
4865 __ movdqu(xmm_temp8, xmm_temp3);
4866 __ movdqu(xmm_temp9, xmm_temp3);
4867 __ pslld(xmm_temp7, 31); // packed left shift, shifting << 31
4868 __ pslld(xmm_temp8, 30); // packed left shift, shifting << 30
4869 __ pslld(xmm_temp9, 25); // packed left shift, shifting << 25
4870 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
4871 __ pxor(xmm_temp7, xmm_temp9);
4872 __ movdqu(xmm_temp8, xmm_temp7);
4873 __ pslldq(xmm_temp7, 12);
4874 __ psrldq(xmm_temp8, 4);
4875 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
4876
4877 //
4878 // Second phase of the reduction
4879 //
4880 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4881 // shift operations.
4882 __ movdqu(xmm_temp2, xmm_temp3);
4883 __ movdqu(xmm_temp4, xmm_temp3);
4884 __ movdqu(xmm_temp5, xmm_temp3);
4885 __ psrld(xmm_temp2, 1); // packed right shift, shifting >> 1
4886 __ psrld(xmm_temp4, 2); // packed right shift, shifting >> 2
4887 __ psrld(xmm_temp5, 7); // packed right shift, shifting >> 7
4888 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
4889 __ pxor(xmm_temp2, xmm_temp5);
4890 __ pxor(xmm_temp2, xmm_temp8);
4891 __ pxor(xmm_temp3, xmm_temp2);
4892 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
4893
4894 __ decrement(blocks);
4895 __ jcc(Assembler::zero, L_exit);
4896 __ movdqu(xmm_temp0, xmm_temp6);
4897 __ addptr(data, 16);
4898 __ jmp(L_ghash_loop);
4899
4900 __ BIND(L_exit);
4901 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
4902 __ movdqu(Address(state, 0), xmm_temp6); // store the result
4903 __ leave();
4904 __ ret(0);
4905 return start;
4906 }
4907
4908 //base64 character set
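// Each 6-bit value maps to one output character; characters are stored one
// per 32-bit slot so the encoder can fetch them either with dword gathers
// (evpgatherdd) or with scalar loads scaled by 4.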
4909 address base64_charset_addr() {
4910 __ align(CodeEntryAlignment);
4911 StubCodeMark mark(this, "StubRoutines", "base64_charset");
4912 address start = __ pc();
4913 __ emit_data64(0x0000004200000041, relocInfo::none);
4914 __ emit_data64(0x0000004400000043, relocInfo::none);
4915 __ emit_data64(0x0000004600000045, relocInfo::none);
4916 __ emit_data64(0x0000004800000047, relocInfo::none);
4917 __ emit_data64(0x0000004a00000049, relocInfo::none);
4918 __ emit_data64(0x0000004c0000004b, relocInfo::none);
4919 __ emit_data64(0x0000004e0000004d, relocInfo::none);
4920 __ emit_data64(0x000000500000004f, relocInfo::none);
4921 __ emit_data64(0x0000005200000051, relocInfo::none);
4922 __ emit_data64(0x0000005400000053, relocInfo::none);
4923 __ emit_data64(0x0000005600000055, relocInfo::none);
4924 __ emit_data64(0x0000005800000057, relocInfo::none);
4925 __ emit_data64(0x0000005a00000059, relocInfo::none);
4926 __ emit_data64(0x0000006200000061, relocInfo::none);
4927 __ emit_data64(0x0000006400000063, relocInfo::none);
4928 __ emit_data64(0x0000006600000065, relocInfo::none);
4929 __ emit_data64(0x0000006800000067, relocInfo::none);
4930 __ emit_data64(0x0000006a00000069, relocInfo::none);
4931 __ emit_data64(0x0000006c0000006b, relocInfo::none);
4932 __ emit_data64(0x0000006e0000006d, relocInfo::none);
4933 __ emit_data64(0x000000700000006f, relocInfo::none);
4934 __ emit_data64(0x0000007200000071, relocInfo::none);
4935 __ emit_data64(0x0000007400000073, relocInfo::none);
4936 __ emit_data64(0x0000007600000075, relocInfo::none);
4937 __ emit_data64(0x0000007800000077, relocInfo::none);
4938 __ emit_data64(0x0000007a00000079, relocInfo::none);
4939 __ emit_data64(0x0000003100000030, relocInfo::none);
4940 __ emit_data64(0x0000003300000032, relocInfo::none);
4941 __ emit_data64(0x0000003500000034, relocInfo::none);
4942 __ emit_data64(0x0000003700000036, relocInfo::none);
4943 __ emit_data64(0x0000003900000038, relocInfo::none);
4944 __ emit_data64(0x0000002f0000002b, relocInfo::none);
4945 return start;
4946 }
4947
4948 //base64 url character set
4949 address base64url_charset_addr() {
4950 __ align(CodeEntryAlignment);
4951 StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4952 address start = __ pc();
4953 __ emit_data64(0x0000004200000041, relocInfo::none);
4954 __ emit_data64(0x0000004400000043, relocInfo::none);
4955 __ emit_data64(0x0000004600000045, relocInfo::none);
4956 __ emit_data64(0x0000004800000047, relocInfo::none);
4957 __ emit_data64(0x0000004a00000049, relocInfo::none);
4958 __ emit_data64(0x0000004c0000004b, relocInfo::none);
4959 __ emit_data64(0x0000004e0000004d, relocInfo::none);
4960 __ emit_data64(0x000000500000004f, relocInfo::none);
4961 __ emit_data64(0x0000005200000051, relocInfo::none);
4962 __ emit_data64(0x0000005400000053, relocInfo::none);
4963 __ emit_data64(0x0000005600000055, relocInfo::none);
4964 __ emit_data64(0x0000005800000057, relocInfo::none);
4965 __ emit_data64(0x0000005a00000059, relocInfo::none);
4966 __ emit_data64(0x0000006200000061, relocInfo::none);
4967 __ emit_data64(0x0000006400000063, relocInfo::none);
4968 __ emit_data64(0x0000006600000065, relocInfo::none);
4969 __ emit_data64(0x0000006800000067, relocInfo::none);
4970 __ emit_data64(0x0000006a00000069, relocInfo::none);
4971 __ emit_data64(0x0000006c0000006b, relocInfo::none);
4972 __ emit_data64(0x0000006e0000006d, relocInfo::none);
4973 __ emit_data64(0x000000700000006f, relocInfo::none);
4974 __ emit_data64(0x0000007200000071, relocInfo::none);
4975 __ emit_data64(0x0000007400000073, relocInfo::none);
4976 __ emit_data64(0x0000007600000075, relocInfo::none);
4977 __ emit_data64(0x0000007800000077, relocInfo::none);
4978 __ emit_data64(0x0000007a00000079, relocInfo::none);
4979 __ emit_data64(0x0000003100000030, relocInfo::none);
4980 __ emit_data64(0x0000003300000032, relocInfo::none);
4981 __ emit_data64(0x0000003500000034, relocInfo::none);
4982 __ emit_data64(0x0000003700000036, relocInfo::none);
4983 __ emit_data64(0x0000003900000038, relocInfo::none);
4984 __ emit_data64(0x0000005f0000002d, relocInfo::none);
4985
4986 return start;
4987 }
4988
4989 address base64_bswap_mask_addr() {
4990 __ align(CodeEntryAlignment);
4991 StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
4992 address start = __ pc();
4993 __ emit_data64(0x0504038002010080, relocInfo::none);
4994 __ emit_data64(0x0b0a098008070680, relocInfo::none);
4995 __ emit_data64(0x0908078006050480, relocInfo::none);
4996 __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
4997 __ emit_data64(0x0605048003020180, relocInfo::none);
4998 __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
4999 __ emit_data64(0x0504038002010080, relocInfo::none);
5000 __ emit_data64(0x0b0a098008070680, relocInfo::none);
5001
5002 return start;
5003 }
5004
5005 address base64_right_shift_mask_addr() {
5006 __ align(CodeEntryAlignment);
5007 StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
5008 address start = __ pc();
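// Per-16-bit-word right-shift counts {0, 2, 4, 6}, repeated across the
// 512-bit register; used with evpsrlvw to align the four 6-bit groups.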
5009 __ emit_data64(0x0006000400020000, relocInfo::none);
5010 __ emit_data64(0x0006000400020000, relocInfo::none);
5011 __ emit_data64(0x0006000400020000, relocInfo::none);
5012 __ emit_data64(0x0006000400020000, relocInfo::none);
5013 __ emit_data64(0x0006000400020000, relocInfo::none);
5014 __ emit_data64(0x0006000400020000, relocInfo::none);
5015 __ emit_data64(0x0006000400020000, relocInfo::none);
5016 __ emit_data64(0x0006000400020000, relocInfo::none);
5017
5018 return start;
5019 }
5020
5021 address base64_left_shift_mask_addr() {
5022 __ align(CodeEntryAlignment);
5023 StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
5024 address start = __ pc();
5025 __ emit_data64(0x0000000200040000, relocInfo::none);
5026 __ emit_data64(0x0000000200040000, relocInfo::none);
5027 __ emit_data64(0x0000000200040000, relocInfo::none);
5028 __ emit_data64(0x0000000200040000, relocInfo::none);
5029 __ emit_data64(0x0000000200040000, relocInfo::none);
5030 __ emit_data64(0x0000000200040000, relocInfo::none);
5031 __ emit_data64(0x0000000200040000, relocInfo::none);
5032 __ emit_data64(0x0000000200040000, relocInfo::none);
5033
5034 return start;
5035 }
5036
5037 address base64_and_mask_addr() {
5038 __ align(CodeEntryAlignment);
5039 StubCodeMark mark(this, "StubRoutines", "and_mask");
5040 address start = __ pc();
5041 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5042 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5043 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5044 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5045 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5046 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5047 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5048 __ emit_data64(0x3f003f003f000000, relocInfo::none);
5049 return start;
5050 }
5051
5052 address base64_gather_mask_addr() {
5053 __ align(CodeEntryAlignment);
5054 StubCodeMark mark(this, "StubRoutines", "gather_mask");
5055 address start = __ pc();
5056 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5057 return start;
5058 }
5059
5060 // Code for generating Base64 encoding.
5061 // Intrinsic function prototype in Base64.java:
5062 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
5063 address generate_base64_encodeBlock() {
5064 __ align(CodeEntryAlignment);
5065 StubCodeMark mark(this, "StubRoutines", "implEncode");
5066 address start = __ pc();
5067 __ enter();
5068
5069 // Save callee-saved registers before using them
5070 __ push(r12);
5071 __ push(r13);
5072 __ push(r14);
5073 __ push(r15);
5074
5075 // arguments
5076 const Register source = c_rarg0; // Source Array
5077 const Register start_offset = c_rarg1; // start offset
5078 const Register end_offset = c_rarg2; // end offset
5079 const Register dest = c_rarg3; // destination array
5080
5081 #ifndef _WIN64
5082 const Register dp = c_rarg4; // Position for writing to dest array
5083 const Register isURL = c_rarg5;// Base64 or URL character set
5084 #else
5085 const Address dp_mem(rbp, 6 * wordSize); // dp is on the stack on Win64
5086 const Address isURL_mem(rbp, 7 * wordSize);
5087 const Register isURL = r10; // pick the volatile windows register
5088 const Register dp = r12;
5089 __ movl(dp, dp_mem);
5090 __ movl(isURL, isURL_mem);
5091 #endif
5092
5093 const Register length = r14;
5094 Label L_process80, L_process32, L_process3, L_exit, L_processdata;
5095
5096 // calculate length from offsets
5097 __ movl(length, end_offset);
5098 __ subl(length, start_offset);
5099 __ cmpl(length, 0);
5100 __ jcc(Assembler::lessEqual, L_exit);
5101
5102 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
5103 // check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded
5104 __ cmpl(isURL, 0);
5105 __ jcc(Assembler::equal, L_processdata);
5106 __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
5107
5108 // load masks required for encoding data
5109 __ BIND(L_processdata);
5110 __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
5111 // Set 64 bits of K register.
5112 __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
5113 __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
5114 __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
5115 __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
5116 __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
5117
5118 // Vector Base64 implementation, producing 96 bytes of encoded data
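// Each 256-bit load consumes 24 source bytes (eight 3-byte groups), which
// expand to 32 encoded characters; three such loads per iteration turn
// 72 input bytes into 96 output bytes.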
5119 __ BIND(L_process80);
5120 __ cmpl(length, 80);
5121 __ jcc(Assembler::below, L_process32);
5122 __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
5123 __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
5124 __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
5125
5126 //permute the input so that each 128-bit lane holds a contiguous run of source bytes
5127 __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
5128 __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
5129 __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
5130
5131 //shuffle the input to group 3 source bytes per 32-bit slot, inserting 0 as the 4th byte.
5132 //we can deal with 12 bytes at a time in a 128 bit register
5133 __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
5134 __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
5135 __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
5136
5137 //convert byte to word. Each 128 bit register will have 6 bytes for processing
5138 __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
5139 __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
5140 __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
5141
5142 // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit indices
5143 __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit);
5144 __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
5145 __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
5146
5147 __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
5148 __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
5149 __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
5150
5151 __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
5152 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5153 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5154
5155 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5156 __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5157 __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5158
5159 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5160 __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
5161 __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
5162
5163 // Get the final 4*6 bits base64 encoding
5164 __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
5165 __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
5166 __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
5167
5168 // Shift
5169 __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5170 __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5171 __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5172
5173 // look up 6 bits in the base64 character set to fetch the encoding
5174 // we are converting word to dword as gather instructions need dword indices for looking up encoding
5175 __ vextracti64x4(xmm6, xmm3, 0);
5176 __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
5177 __ vextracti64x4(xmm6, xmm3, 1);
5178 __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
5179
5180 __ vextracti64x4(xmm6, xmm4, 0);
5181 __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
5182 __ vextracti64x4(xmm6, xmm4, 1);
5183 __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
5184
5185 __ vextracti64x4(xmm4, xmm5, 0);
5186 __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
5187
5188 __ vextracti64x4(xmm4, xmm5, 1);
5189 __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
5190
5191 __ kmovql(k2, k3);
5192 __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
5193 __ kmovql(k2, k3);
5194 __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
5195 __ kmovql(k2, k3);
5196 __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
5197 __ kmovql(k2, k3);
5198 __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
5199 __ kmovql(k2, k3);
5200 __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5201 __ kmovql(k2, k3);
5202 __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
5203
5204 //Down convert dword to byte. Final output is 16*6 = 96 bytes long
5205 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
5206 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
5207 __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
5208 __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
5209 __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
5210 __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
5211
5212 __ addq(dest, 96);
5213 __ addq(source, 72);
5214 __ subq(length, 72);
5215 __ jmp(L_process80);
5216
5217 // Vector Base64 implementation generating 32 bytes of encoded data
5218 __ BIND(L_process32);
5219 __ cmpl(length, 32);
5220 __ jcc(Assembler::below, L_process3);
5221 __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
5222 __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
5223 __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
5224 __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
5225 __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
5226 __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
5227
5228 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5229 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5230 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5231 __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
5232 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5233 __ vextracti64x4(xmm9, xmm1, 0);
5234 __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
5235 __ vextracti64x4(xmm9, xmm1, 1);
5236 __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit);
5237 __ kmovql(k2, k3);
5238 __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5239 __ kmovql(k2, k3);
5240 __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
5241 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
5242 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
5243 __ subq(length, 24);
5244 __ addq(dest, 32);
5245 __ addq(source, 24);
5246 __ jmp(L_process32);
5247
5248 // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
5249 /* This code corresponds to the scalar version of the following snippet in Base64.java
5250 ** int bits = (src[sp0++] & 0xff) << 16 |(src[sp0++] & 0xff) << 8 |(src[sp0++] & 0xff);
5251 ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
5252 ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
5253 ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
5254 ** dst[dp0++] = (byte)base64[bits & 0x3f];*/
5255 __ BIND(L_process3);
5256 __ cmpl(length, 3);
5257 __ jcc(Assembler::below, L_exit);
5258 // Read 1 byte at a time
5259 __ movzbl(rax, Address(source, start_offset));
5260 __ shll(rax, 0x10);
5261 __ movl(r15, rax);
5262 __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
5263 __ shll(rax, 0x8);
5264 __ movzwl(rax, rax);
5265 __ orl(r15, rax);
5266 __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
5267 __ orl(rax, r15);
5268 // Save 3 bytes read in r15
5269 __ movl(r15, rax);
5270 __ shrl(rax, 0x12);
5271 __ andl(rax, 0x3f);
5272 // rax contains the index, r11 contains base64 lookup table
5273 __ movb(rax, Address(r11, rax, Address::times_4));
5274 // Write the encoded byte to destination
5275 __ movb(Address(dest, dp, Address::times_1, 0), rax);
5276 __ movl(rax, r15);
5277 __ shrl(rax, 0xc);
5278 __ andl(rax, 0x3f);
5279 __ movb(rax, Address(r11, rax, Address::times_4));
5280 __ movb(Address(dest, dp, Address::times_1, 1), rax);
5281 __ movl(rax, r15);
5282 __ shrl(rax, 0x6);
5283 __ andl(rax, 0x3f);
5284 __ movb(rax, Address(r11, rax, Address::times_4));
5285 __ movb(Address(dest, dp, Address::times_1, 2), rax);
5286 __ movl(rax, r15);
5287 __ andl(rax, 0x3f);
5288 __ movb(rax, Address(r11, rax, Address::times_4));
5289 __ movb(Address(dest, dp, Address::times_1, 3), rax);
5290 __ subl(length, 3);
5291 __ addq(dest, 4);
5292 __ addq(source, 3);
5293 __ jmp(L_process3);
5294 __ BIND(L_exit);
5295 __ pop(r15);
5296 __ pop(r14);
5297 __ pop(r13);
5298 __ pop(r12);
5299 __ leave();
5300 __ ret(0);
5301 return start;
5302 }
5303
5304 /**
5305 * Arguments:
5306 *
5307 * Inputs:
5308 * c_rarg0 - int crc
5309 * c_rarg1 - byte* buf
5310 * c_rarg2 - int length
5311 *
5312 * Output:
5313 * rax - int crc result
5314 */
5315 address generate_updateBytesCRC32() {
5316 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5317
5318 __ align(CodeEntryAlignment);
5319 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5320
5321 address start = __ pc();
5322 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5323 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5324 // rscratch1: r10
5325 const Register crc = c_rarg0; // crc
5326 const Register buf = c_rarg1; // source java byte array address
5327 const Register len = c_rarg2; // length
5328 const Register table = c_rarg3; // crc_table address (reuse register)
5329 const Register tmp = r11;
5330 assert_different_registers(crc, buf, len, table, tmp, rax);
5331
5332 BLOCK_COMMENT("Entry:");
5333 __ enter(); // required for proper stackwalking of RuntimeStub frame
5334
5335 __ kernel_crc32(crc, buf, len, table, tmp);
5336
5337 __ movl(rax, crc);
5338 __ vzeroupper();
5339 __ leave(); // required for proper stackwalking of RuntimeStub frame
5340 __ ret(0);
5341
5342 return start;
5343 }
5344
5345 /**
5346 * Arguments:
5347 *
5348 * Inputs:
5349 * c_rarg0 - int crc
5350 * c_rarg1 - byte* buf
5351 * c_rarg2 - long length
5352 * c_rarg3 - table_start - optional (present only when doing a library_call,
5353 * not used by x86 algorithm)
5354 *
5355 * Output:
5356 * rax - int crc result
5357 */
5358 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5359 assert(UseCRC32CIntrinsics, "need SSE4_2");
5360 __ align(CodeEntryAlignment);
5361 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5362 address start = __ pc();
5363 //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs
5364 //Windows RCX RDX R8 R9 none none XMM0..XMM3
5365 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7
5366 const Register crc = c_rarg0; // crc
5367 const Register buf = c_rarg1; // source java byte array address
5368 const Register len = c_rarg2; // length
5369 const Register a = rax;
5370 const Register j = r9;
5371 const Register k = r10;
5372 const Register l = r11;
5373 #ifdef _WIN64
5374 const Register y = rdi;
5375 const Register z = rsi;
5376 #else
5377 const Register y = rcx;
5378 const Register z = r8;
5379 #endif
5380 assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5381
5382 BLOCK_COMMENT("Entry:");
5383 __ enter(); // required for proper stackwalking of RuntimeStub frame
5384 #ifdef _WIN64
5385 __ push(y);
5386 __ push(z);
5387 #endif
5388 __ crc32c_ipl_alg2_alt2(crc, buf, len,
5389 a, j, k,
5390 l, y, z,
5391 c_farg0, c_farg1, c_farg2,
5392 is_pclmulqdq_supported);
5393 __ movl(rax, crc);
5394 #ifdef _WIN64
5395 __ pop(z);
5396 __ pop(y);
5397 #endif
5398 __ vzeroupper();
5399 __ leave(); // required for proper stackwalking of RuntimeStub frame
5400 __ ret(0);
5401
5402 return start;
5403 }
5404
5405 /**
5406 * Arguments:
5407 *
5408 * Input:
5409 * c_rarg0 - x address
5410 * c_rarg1 - x length
5411 * c_rarg2 - y address
5412 * c_rarg3 - y length
5413 * not Win64
5414 * c_rarg4 - z address
5415 * c_rarg5 - z length
5416 * Win64
5417 * rsp+40 - z address
5418 * rsp+48 - z length
5419 */
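// A C-style sketch of the schoolbook multiplication this intrinsic replaces
// (illustrative only; assumes z[] is zero-initialized and, as in
// java.math.BigInteger, limbs are 32-bit ints stored most significant first):
//
//   for (int i = xlen - 1; i >= 0; i--) {
//     uint64_t carry = 0;
//     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
//       uint64_t product = (uint64_t)x[i] * y[j] + z[k] + carry;
//       z[k] = (uint32_t)product;
//       carry = product >> 32;
//     }
//     z[i] = (uint32_t)carry;
//   }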
5420 address generate_multiplyToLen() {
5421 __ align(CodeEntryAlignment);
5422 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5423
5424 address start = __ pc();
5425 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5426 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5427 const Register x = rdi;
5428 const Register xlen = rax;
5429 const Register y = rsi;
5430 const Register ylen = rcx;
5431 const Register z = r8;
5432 const Register zlen = r11;
5433
5434 // Next registers will be saved on stack in multiply_to_len().
5435 const Register tmp1 = r12;
5436 const Register tmp2 = r13;
5437 const Register tmp3 = r14;
5438 const Register tmp4 = r15;
5439 const Register tmp5 = rbx;
5440
5441 BLOCK_COMMENT("Entry:");
5442 __ enter(); // required for proper stackwalking of RuntimeStub frame
5443
5444 #ifndef _WIN64
5445 __ movptr(zlen, r9); // Save r9 in r11 - zlen
5446 #endif
5447 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5448 // ylen => rcx, z => r8, zlen => r11
5449 // r9 and r10 may be used to save non-volatile registers
5450 #ifdef _WIN64
5451 // last 2 arguments (#4, #5) are on stack on Win64
5452 __ movptr(z, Address(rsp, 6 * wordSize));
5453 __ movptr(zlen, Address(rsp, 7 * wordSize));
5454 #endif
5455
5456 __ movptr(xlen, rsi);
5457 __ movptr(y, rdx);
5458 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5459
5460 restore_arg_regs();
5461
5462 __ leave(); // required for proper stackwalking of RuntimeStub frame
5463 __ ret(0);
5464
5465 return start;
5466 }
5467
5468 /**
5469 * Arguments:
5470 *
5471 * Input:
5472 * c_rarg0 - obja address
5473 * c_rarg1 - objb address
5474 * c_rarg3 - length length
5475 * c_rarg4 - scale log2_array_indxscale
5476 *
5477 * Output:
5478 * rax - int: >= 0 index of the first mismatch; < 0 bitwise complement of the remaining tail length
5479 */
5480 address generate_vectorizedMismatch() {
5481 __ align(CodeEntryAlignment);
5482 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5483 address start = __ pc();
5484
5485 BLOCK_COMMENT("Entry:");
5486 __ enter();
5487
5488 #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5489 const Register scale = c_rarg0; //rcx, will exchange with r9
5490 const Register objb = c_rarg1; //rdx
5491 const Register length = c_rarg2; //r8
5492 const Register obja = c_rarg3; //r9
5493 __ xchgq(obja, scale); //now obja and scale contains the correct contents
5494
5495 const Register tmp1 = r10;
5496 const Register tmp2 = r11;
5497 #endif
5498 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5499 const Register obja = c_rarg0; //U:rdi
5500 const Register objb = c_rarg1; //U:rsi
5501 const Register length = c_rarg2; //U:rdx
5502 const Register scale = c_rarg3; //U:rcx
5503 const Register tmp1 = r8;
5504 const Register tmp2 = r9;
5505 #endif
5506 const Register result = rax; //return value
5507 const XMMRegister vec0 = xmm0;
5508 const XMMRegister vec1 = xmm1;
5509 const XMMRegister vec2 = xmm2;
5510
5511 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5512
5513 __ vzeroupper();
5514 __ leave();
5515 __ ret(0);
5516
5517 return start;
5518 }
5519
5520 /**
5521 * Arguments:
5522 *
5523 // Input:
5524 // c_rarg0 - x address
5525 // c_rarg1 - x length
5526 // c_rarg2 - z address
5527 // c_rarg3 - z length
5528 *
5529 */
5530 address generate_squareToLen() {
5531
5532 __ align(CodeEntryAlignment);
5533 StubCodeMark mark(this, "StubRoutines", "squareToLen");
5534
5535 address start = __ pc();
5536 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5537 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5538 const Register x = rdi;
5539 const Register len = rsi;
5540 const Register z = r8;
5541 const Register zlen = rcx;
5542
5543 const Register tmp1 = r12;
5544 const Register tmp2 = r13;
5545 const Register tmp3 = r14;
5546 const Register tmp4 = r15;
5547 const Register tmp5 = rbx;
5548
5549 BLOCK_COMMENT("Entry:");
5550 __ enter(); // required for proper stackwalking of RuntimeStub frame
5551
5552 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
5553 // zlen => rcx
5554 // r9 and r10 may be used to save non-volatile registers
5555 __ movptr(r8, rdx);
5556 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5557
5558 restore_arg_regs();
5559
5560 __ leave(); // required for proper stackwalking of RuntimeStub frame
5561 __ ret(0);
5562
5563 return start;
5564 }
5565
5566 address generate_method_entry_barrier() {
5567 __ align(CodeEntryAlignment);
5568 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5569
5570 Label deoptimize_label;
5571
5572 address start = __ pc();
5573
5574 __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
5575
5576 BLOCK_COMMENT("Entry:");
5577 __ enter(); // save rbp
5578
5579 // save c_rarg0, because we want to use that value.
5580 // We could do without it but then we depend on the number of slots used by pusha
5581 __ push(c_rarg0);
5582
5583 __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
5584
5585 __ pusha();
5586
5587 // The method may have floats as arguments, and we must spill them before calling
5588 // the VM runtime.
5589 assert(Argument::n_float_register_parameters_j == 8, "Assumption");
5590 const int xmm_size = wordSize * 2;
5591 const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
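// Only the low 128 bits of each register are spilled; that is sufficient
// here because the Java float/double argument registers carry values at
// most 64 bits wide.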
5592 __ subptr(rsp, xmm_spill_size);
5593 __ movdqu(Address(rsp, xmm_size * 7), xmm7);
5594 __ movdqu(Address(rsp, xmm_size * 6), xmm6);
5595 __ movdqu(Address(rsp, xmm_size * 5), xmm5);
5596 __ movdqu(Address(rsp, xmm_size * 4), xmm4);
5597 __ movdqu(Address(rsp, xmm_size * 3), xmm3);
5598 __ movdqu(Address(rsp, xmm_size * 2), xmm2);
5599 __ movdqu(Address(rsp, xmm_size * 1), xmm1);
5600 __ movdqu(Address(rsp, xmm_size * 0), xmm0);
5601
5602 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
5603
5604 __ movdqu(xmm0, Address(rsp, xmm_size * 0));
5605 __ movdqu(xmm1, Address(rsp, xmm_size * 1));
5606 __ movdqu(xmm2, Address(rsp, xmm_size * 2));
5607 __ movdqu(xmm3, Address(rsp, xmm_size * 3));
5608 __ movdqu(xmm4, Address(rsp, xmm_size * 4));
5609 __ movdqu(xmm5, Address(rsp, xmm_size * 5));
5610 __ movdqu(xmm6, Address(rsp, xmm_size * 6));
5611 __ movdqu(xmm7, Address(rsp, xmm_size * 7));
5612 __ addptr(rsp, xmm_spill_size);
5613
5614 __ cmpl(rax, 1); // 1 means deoptimize
5615 __ jcc(Assembler::equal, deoptimize_label);
5616
5617 __ popa();
5618 __ pop(c_rarg0);
5619
5620 __ leave();
5621
5622 __ addptr(rsp, 1 * wordSize); // cookie
5623 __ ret(0);
5624
5625
5626 __ BIND(deoptimize_label);
5627
5628 __ popa();
5629 __ pop(c_rarg0);
5630
5631 __ leave();
5632
5633 // this can be taken out, but is good for verification purposes. getting a SIGSEGV
5634 // here while still having a correct stack is valuable
5635 __ testptr(rsp, Address(rsp, 0));
5636
5637 __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
5638 __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
5639
5640 return start;
5641 }
5642
5643 /**
5644 * Arguments:
5645 *
5646 * Input:
5647 * c_rarg0 - out address
5648 * c_rarg1 - in address
5649 * c_rarg2 - offset
5650 * c_rarg3 - len
5651 * not Win64
5652 * c_rarg4 - k
5653 * Win64
5654 * rsp+40 - k
5655 */
5656 address generate_mulAdd() {
5657 __ align(CodeEntryAlignment);
5658 StubCodeMark mark(this, "StubRoutines", "mulAdd");
5659
5660 address start = __ pc();
5661 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5662 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5663 const Register out = rdi;
5664 const Register in = rsi;
5665 const Register offset = r11;
5666 const Register len = rcx;
5667 const Register k = r8;
5668
5669 // Next registers will be saved on stack in mul_add().
5670 const Register tmp1 = r12;
5671 const Register tmp2 = r13;
5672 const Register tmp3 = r14;
5673 const Register tmp4 = r15;
5674 const Register tmp5 = rbx;
5675
5676 BLOCK_COMMENT("Entry:");
5677 __ enter(); // required for proper stackwalking of RuntimeStub frame
5678
5679 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5680 // len => rcx, k => r8
5681 // r9 and r10 may be used to save non-volatile registers
5682 #ifdef _WIN64
5683 // last argument is on stack on Win64
5684 __ movl(k, Address(rsp, 6 * wordSize));
5685 #endif
5686 __ movptr(r11, rdx); // move offset in rdx to offset(r11)
5687 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5688
5689 restore_arg_regs();
5690
5691 __ leave(); // required for proper stackwalking of RuntimeStub frame
5692 __ ret(0);
5693
5694 return start;
5695 }
5696
5697 address generate_libmExp() {
5698 StubCodeMark mark(this, "StubRoutines", "libmExp");
5699
5700 address start = __ pc();
5701
5702 const XMMRegister x0 = xmm0;
5703 const XMMRegister x1 = xmm1;
5704 const XMMRegister x2 = xmm2;
5705 const XMMRegister x3 = xmm3;
5706
5707 const XMMRegister x4 = xmm4;
5708 const XMMRegister x5 = xmm5;
5709 const XMMRegister x6 = xmm6;
5710 const XMMRegister x7 = xmm7;
5711
5712 const Register tmp = r11;
5713
5714 BLOCK_COMMENT("Entry:");
5715 __ enter(); // required for proper stackwalking of RuntimeStub frame
5716
5717 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5718
5719 __ leave(); // required for proper stackwalking of RuntimeStub frame
5720 __ ret(0);
5721
5722 return start;
5723
5724 }
5725
5726 address generate_libmLog() {
5727 StubCodeMark mark(this, "StubRoutines", "libmLog");
5728
5729 address start = __ pc();
5730
5731 const XMMRegister x0 = xmm0;
5732 const XMMRegister x1 = xmm1;
5733 const XMMRegister x2 = xmm2;
5734 const XMMRegister x3 = xmm3;
5735
5736 const XMMRegister x4 = xmm4;
5737 const XMMRegister x5 = xmm5;
5738 const XMMRegister x6 = xmm6;
5739 const XMMRegister x7 = xmm7;
5740
5741 const Register tmp1 = r11;
5742 const Register tmp2 = r8;
5743
5744 BLOCK_COMMENT("Entry:");
5745 __ enter(); // required for proper stackwalking of RuntimeStub frame
5746
5747 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5748
5749 __ leave(); // required for proper stackwalking of RuntimeStub frame
5750 __ ret(0);
5751
5752 return start;
5753
5754 }
5755
5756 address generate_libmLog10() {
5757 StubCodeMark mark(this, "StubRoutines", "libmLog10");
5758
5759 address start = __ pc();
5760
5761 const XMMRegister x0 = xmm0;
5762 const XMMRegister x1 = xmm1;
5763 const XMMRegister x2 = xmm2;
5764 const XMMRegister x3 = xmm3;
5765
5766 const XMMRegister x4 = xmm4;
5767 const XMMRegister x5 = xmm5;
5768 const XMMRegister x6 = xmm6;
5769 const XMMRegister x7 = xmm7;
5770
5771 const Register tmp = r11;
5772
5773 BLOCK_COMMENT("Entry:");
5774 __ enter(); // required for proper stackwalking of RuntimeStub frame
5775
5776 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5777
5778 __ leave(); // required for proper stackwalking of RuntimeStub frame
5779 __ ret(0);
5780
5781 return start;
5782
5783 }
5784
5785 address generate_libmPow() {
5786 StubCodeMark mark(this, "StubRoutines", "libmPow");
5787
5788 address start = __ pc();
5789
5790 const XMMRegister x0 = xmm0;
5791 const XMMRegister x1 = xmm1;
5792 const XMMRegister x2 = xmm2;
5793 const XMMRegister x3 = xmm3;
5794
5795 const XMMRegister x4 = xmm4;
5796 const XMMRegister x5 = xmm5;
5797 const XMMRegister x6 = xmm6;
5798 const XMMRegister x7 = xmm7;
5799
5800 const Register tmp1 = r8;
5801 const Register tmp2 = r9;
5802 const Register tmp3 = r10;
5803 const Register tmp4 = r11;
5804
5805 BLOCK_COMMENT("Entry:");
5806 __ enter(); // required for proper stackwalking of RuntimeStub frame
5807
5808 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5809
5810 __ leave(); // required for proper stackwalking of RuntimeStub frame
5811 __ ret(0);
5812
5813 return start;
5814
5815 }
5816
5817 address generate_libmSin() {
5818 StubCodeMark mark(this, "StubRoutines", "libmSin");
5819
5820 address start = __ pc();
5821
5822 const XMMRegister x0 = xmm0;
5823 const XMMRegister x1 = xmm1;
5824 const XMMRegister x2 = xmm2;
5825 const XMMRegister x3 = xmm3;
5826
5827 const XMMRegister x4 = xmm4;
5828 const XMMRegister x5 = xmm5;
5829 const XMMRegister x6 = xmm6;
5830 const XMMRegister x7 = xmm7;
5831
5832 const Register tmp1 = r8;
5833 const Register tmp2 = r9;
5834 const Register tmp3 = r10;
5835 const Register tmp4 = r11;
5836
5837 BLOCK_COMMENT("Entry:");
5838 __ enter(); // required for proper stackwalking of RuntimeStub frame
5839
5840 #ifdef _WIN64
5841 __ push(rsi);
5842 __ push(rdi);
5843 #endif
5844 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5845
5846 #ifdef _WIN64
5847 __ pop(rdi);
5848 __ pop(rsi);
5849 #endif
5850
5851 __ leave(); // required for proper stackwalking of RuntimeStub frame
5852 __ ret(0);
5853
5854 return start;
5855
5856 }
5857
5858 address generate_libmCos() {
5859 StubCodeMark mark(this, "StubRoutines", "libmCos");
5860
5861 address start = __ pc();
5862
5863 const XMMRegister x0 = xmm0;
5864 const XMMRegister x1 = xmm1;
5865 const XMMRegister x2 = xmm2;
5866 const XMMRegister x3 = xmm3;
5867
5868 const XMMRegister x4 = xmm4;
5869 const XMMRegister x5 = xmm5;
5870 const XMMRegister x6 = xmm6;
5871 const XMMRegister x7 = xmm7;
5872
5873 const Register tmp1 = r8;
5874 const Register tmp2 = r9;
5875 const Register tmp3 = r10;
5876 const Register tmp4 = r11;
5877
5878 BLOCK_COMMENT("Entry:");
5879 __ enter(); // required for proper stackwalking of RuntimeStub frame
5880
5881 #ifdef _WIN64
5882 __ push(rsi);
5883 __ push(rdi);
5884 #endif
5885 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5886
5887 #ifdef _WIN64
5888 __ pop(rdi);
5889 __ pop(rsi);
5890 #endif
5891
5892 __ leave(); // required for proper stackwalking of RuntimeStub frame
5893 __ ret(0);
5894
5895 return start;
5896
5897 }
5898
5899 address generate_libmTan() {
5900 StubCodeMark mark(this, "StubRoutines", "libmTan");
5901
5902 address start = __ pc();
5903
5904 const XMMRegister x0 = xmm0;
5905 const XMMRegister x1 = xmm1;
5906 const XMMRegister x2 = xmm2;
5907 const XMMRegister x3 = xmm3;
5908
5909 const XMMRegister x4 = xmm4;
5910 const XMMRegister x5 = xmm5;
5911 const XMMRegister x6 = xmm6;
5912 const XMMRegister x7 = xmm7;
5913
5914 const Register tmp1 = r8;
5915 const Register tmp2 = r9;
5916 const Register tmp3 = r10;
5917 const Register tmp4 = r11;
5918
5919 BLOCK_COMMENT("Entry:");
5920 __ enter(); // required for proper stackwalking of RuntimeStub frame
5921
5922 #ifdef _WIN64
5923 __ push(rsi);
5924 __ push(rdi);
5925 #endif
5926 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5927
5928 #ifdef _WIN64
5929 __ pop(rdi);
5930 __ pop(rsi);
5931 #endif
5932
5933 __ leave(); // required for proper stackwalking of RuntimeStub frame
5934 __ ret(0);
5935
5936 return start;
5937
5938 }
5939
5940 #undef __
5941 #define __ masm->
5942
5943 // Continuation point for throwing of implicit exceptions that are
5944 // not handled in the current activation. Fabricates an exception
5945 // oop and initiates normal exception dispatching in this
5946 // frame. Since we need to preserve callee-saved values (currently
5947 // only for C2, but done for C1 as well) we need a callee-saved oop
5948 // map and therefore have to make these stubs into RuntimeStubs
5949 // rather than BufferBlobs. If the compiler needs all registers to
5950 // be preserved between the fault point and the exception handler
5951 // then it must assume responsibility for that in
5952 // AbstractCompiler::continuation_for_implicit_null_exception or
5953 // continuation_for_implicit_division_by_zero_exception. All other
5954 // implicit exceptions (e.g., NullPointerException or
5955 // AbstractMethodError on entry) are either at call sites or
5956 // otherwise assume that stack unwinding will be initiated, so
5957 // caller saved registers were assumed volatile in the compiler.
5958 address generate_throw_exception(const char* name,
5959 address runtime_entry,
5960 Register arg1 = noreg,
5961 Register arg2 = noreg) {
5962 // Information about frame layout at time of blocking runtime call.
5963 // Note that we only have to preserve callee-saved registers since
5964 // the compilers are responsible for supplying a continuation point
5965 // if they expect all registers to be preserved.
5966 enum layout {
5967 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5968 rbp_off2,
5969 return_off,
5970 return_off2,
5971 framesize // inclusive of return address
5972 };
5973
5974 int insts_size = 512;
5975 int locs_size = 64;
5976
5977 CodeBuffer code(name, insts_size, locs_size);
5978 OopMapSet* oop_maps = new OopMapSet();
5979 MacroAssembler* masm = new MacroAssembler(&code);
5980
5981 address start = __ pc();
5982
5983 // This is an inlined and slightly modified version of call_VM
5984 // which has the ability to fetch the return PC out of
5985 // thread-local storage and also sets up last_Java_sp slightly
5986 // differently than the real call_VM
5987
5988 __ enter(); // required for proper stackwalking of RuntimeStub frame
5989
5990 assert(is_even(framesize/2), "sp not 16-byte aligned");
5991
5992 // return address and rbp are already in place
5993 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5994
5995 int frame_complete = __ pc() - start;
5996
5997 // Set up last_Java_sp and last_Java_fp
5998 address the_pc = __ pc();
5999 __ set_last_Java_frame(rsp, rbp, the_pc);
6000 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
6001
6002 // Call runtime
6003 if (arg1 != noreg) {
6004 assert(arg2 != c_rarg1, "clobbered");
6005 __ movptr(c_rarg1, arg1);
6006 }
6007 if (arg2 != noreg) {
6008 __ movptr(c_rarg2, arg2);
6009 }
6010 __ movptr(c_rarg0, r15_thread);
6011 BLOCK_COMMENT("call runtime_entry");
6012 __ call(RuntimeAddress(runtime_entry));
6013
6014 // Generate oop map
6015 OopMap* map = new OopMap(framesize, 0);
6016
6017 oop_maps->add_gc_map(the_pc - start, map);
6018
6019 __ reset_last_Java_frame(true);
6020
6021 __ leave(); // required for proper stackwalking of RuntimeStub frame
6022
6023 // check for pending exceptions
6024 #ifdef ASSERT
6025 Label L;
6026 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
6027 (int32_t) NULL_WORD);
6028 __ jcc(Assembler::notEqual, L);
6029 __ should_not_reach_here();
6030 __ bind(L);
6031 #endif // ASSERT
6032 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6033
6034
6035 // codeBlob framesize is in words (not VMRegImpl::slot_size)
6036 RuntimeStub* stub =
6037 RuntimeStub::new_runtime_stub(name,
6038 &code,
6039 frame_complete,
6040 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6041 oop_maps, false);
6042 return stub->entry_point();
6043 }
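
  // Example use (taken verbatim from generate_all() below): each throw stub
  // pairs a human-readable name with a SharedRuntime entry point that
  // fabricates and posts the exception:
  //
  //   StubRoutines::_throw_AbstractMethodError_entry =
  //     generate_throw_exception("AbstractMethodError throw_exception",
  //                              CAST_FROM_FN_PTR(address,
  //                                               SharedRuntime::
  //                                               throw_AbstractMethodError));
  //
  // The optional arg1/arg2 registers let a caller pass extra arguments
  // (in c_rarg1/c_rarg2) to runtime entries that need them.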

  void create_control_words() {
    // Round to nearest, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
    // Round to nearest, 24-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
    // MXCSR: round to nearest, all exceptions masked
    StubRoutines::_mxcsr_std           = 0x1F80;
    // Note: the following two constants are 80-bit values
    //       layout is critical for correct loading by FPU.
    // Bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias1[0] = 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias1[1] = 0x80000000;
    StubRoutines::_fpu_subnormal_bias1[2] = 0x03ff;
    // Un-Bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias2[0] = 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias2[1] = 0x80000000;
    StubRoutines::_fpu_subnormal_bias2[2] = 0x7bff;
  }
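
  // Bit-level reading of the control words above (a sketch; see the Intel
  // SDM for the authoritative encodings). For the x87 control word 0x027F:
  //
  //   bits  0-5  = 0x3F -> all six x87 exceptions masked
  //   bits  8-9  = 10b  -> precision control: 53-bit (double)
  //   bits 10-11 = 00b  -> rounding control: round to nearest
  //
  // For the SSE control word 0x1F80, bits 7-12 mask the six SSE exceptions
  // and bits 13-14 = 00b select round to nearest; MXCSR has no precision
  // control field.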

  // Initialization
  void generate_initial() {
    // Generates all stubs and initializes the entry points

    // These platform-specific settings are needed by generate_call_stub()
    create_control_words();

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // atomic calls
    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
    StubRoutines::_atomic_xchg_long_entry    = generate_atomic_xchg_long();
    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
    StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte();
    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
    StubRoutines::_atomic_add_entry          = generate_atomic_add();
    StubRoutines::_atomic_add_long_entry     = generate_atomic_add_long();
    StubRoutines::_fence_entry               = generate_orderaccess_fence();

    // platform dependent
    StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
    StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();

    StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub, which uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      bool supports_clmul = VM_Version::supports_clmul();
      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
    }
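
    // supports_clmul() reports the PCLMULQDQ (carry-less multiply) feature;
    // both the CRC32C table layout and the generated stub depend on whether
    // the CLMUL folding variant or the plain table-driven variant is used,
    // which is why the same flag is threaded through both calls above.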
    if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
        StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
        StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
        StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
        StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
        StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
        StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
        StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
        StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
        StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
        StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
        StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
        StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
        StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
        StubRoutines::_dexp = generate_libmExp();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
        StubRoutines::_dlog = generate_libmLog();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
        StubRoutines::_dlog10 = generate_libmLog10();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
        StubRoutines::_dpow = generate_libmPow();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
        StubRoutines::_dsin = generate_libmSin();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dcos = generate_libmCos();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dtan = generate_libmTan();
      }
    }
  }

  void generate_all() {
    // Generates all stubs and initializes the entry points

    // These entry points require SharedInfo::stack0 to be set up in
    // non-core builds and need to be relocatable, so they each
    // fabricate a RuntimeStub internally.
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // entry points that are platform specific
    StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
    StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
    StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
    StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();

    StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
    StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
    StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
    StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
    StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
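
    // How the sign constants are used (a sketch): "mask" constants keep
    // every bit except the sign, "flip" constants hold only the sign bits,
    // so compiled code can implement abs/neg with a single logical op:
    //
    //   andps xmm, [float_sign_mask]   // fabs: clear the sign bit
    //   xorps xmm, [float_sign_flip]   // fneg: toggle the sign bit
    //
    // Each 64-bit literal packs two 32-bit float masks (or one double
    // mask); the vector variants replicate the same pattern across wider
    // registers.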

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
        StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
        StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
      } else {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
      }
    }
    if (UseAESCTRIntrinsics) {
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
        StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
        StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
      } else {
        StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
        StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
      }
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
      char* dst = (char*)StubRoutines::x86::_k256_W;
      char* src = (char*)StubRoutines::x86::_k256;
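      // Widen the 64 k256 round constants into k256_W: each 16-byte group
      // of four constants is stored twice in a row, producing 32-byte rows
      // whose two 128-bit lanes hold identical data -- the layout the AVX2
      // SHA-256 implementation expects. The loop copies 16 groups, i.e.
      // all 256 bytes of k256.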
      for (int ii = 0; ii < 16; ++ii) {
        memcpy(dst + 32 * ii,      src + 16 * ii, 16);
        memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
      }
      StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      if (VM_Version::supports_avx()) {
        StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
        StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
        StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
      } else {
        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      }
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::x86::_and_mask = base64_and_mask_addr();
      StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
      StubRoutines::x86::_base64_charset = base64_charset_addr();
      StubRoutines::x86::_url_charset = base64url_charset_addr();
      StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
      StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
      StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
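
    // SafeFetch usage (a sketch): SafeFetch32(addr, errValue) loads *addr
    // and returns it; if the load faults, the signal handler resumes at the
    // recorded continuation pc and errValue is returned instead, e.g.:
    //
    //   int v = SafeFetch32((int*) maybe_bad_ptr, -1);  // -1 on fault
    //
    // SafeFetchN does the same for an intptr_t-sized load.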

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#ifndef _WINDOWS
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif // !_WINDOWS
#endif // COMPILER2

    if (UseVectorizedMismatchIntrinsic) {
      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration
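
// Two-phase generation (a sketch of the startup protocol): the caller
// invokes StubGenerator_generate() twice, first with all == false for the
// "initial" stubs that must exist before the interpreter is generated (see
// the StackOverflowError comment in generate_initial()), then with
// all == true after universe initialization (see the verify_oop comment in
// generate_all(): "must happen after universe_init").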

#define UCM_TABLE_MAX_ENTRIES 16
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}
