/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
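// (For reference: bits 5:0 of MXCSR are the sticky exception-status flags,
// which this mask discards; bits 15:6 -- FTZ, rounding control, exception
// masks, DAZ -- are the control bits that the checks below compare against
// the expected setting.)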

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter)           \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp):   parameter size (in words)              int
  //    24(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp):   (interpreter) entry point              address
  //    56(rbp):   parameters                             intptr_t*
  //    64(rbp):   parameter size (in words)              int
  //    72(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  // -27 [ saved xmm15          ]
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  // Windows reserves the caller's stack space for arguments 1-4.
  // We spill c_rarg0-c_rarg3 to this space.

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);       // parameter pointer
    __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0));  // get parameter
    __ addptr(c_rarg2, wordSize);         // advance to next parameter
    __ decrementl(c_rarg1);               // decrement counter
    __ push(rax);                         // pass parameter
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);               // get Method*
    __ movptr(c_rarg1, entry_point);      // get entry_point
    __ mov(r13, rsp);                     // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0);              // Copy to eax we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0);              // Copy to eax we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
  //                                           int8_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_byte() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
    address start = __ pc();

    __ movsbq(rax, c_rarg2);
    __ lock();
    __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
  //                                            volatile int64_t* dest,
  //                                            int64_t compare_value)
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for jint atomic::add(jint add_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);
    __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
    __ addl(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
    __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). It is used as part of debugging when
  // ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize); // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK); // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

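    // (Illustrative note: this fixup runs after cvttss2si produced the
    // "integer indefinite" value 0x80000000. The float is NaN iff
    // (bits & 0x7fffffff) > 0x7f800000, i.e. exponent all ones and mantissa
    // nonzero; NaN converts to 0 (c_rarg3 stays zero), everything else is
    // clamped to min_jint/max_jint according to the sign bit.)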
    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

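    // (Same NaN test and clamping idea as in f2i_fixup above, with the 64-bit
    // min_jlong/max_jlong clamp values.)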
    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

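    // (Illustrative note: this fixup runs after cvttsd2si produced the
    // "integer indefinite" value 0x80000000. The NaN test uses 32-bit ops
    // only: fold (low word != 0) into bit 0 of the sign-cleared high word,
    // then a single compare against 0x7ff00000 detects "exponent all ones
    // and mantissa nonzero". NaN -> 0; otherwise clamp to min_jint/max_jint
    // according to the sign.)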
    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0);   // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

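    // (Same NaN test as in d2i_fixup above; NaN -> 0, otherwise clamp to
    // min_jlong/max_jlong according to the sign.)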
    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0);     // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

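    // (The table entries below are qword indices 1,3,5,7 followed by 0,2,4,6:
    // odd-indexed qwords first, then even-indexed ones.)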
    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
      // After previous pushes.
      oop_to_verify = 6 * wordSize,
      saved_rax     = 7 * wordSize,
      saved_r10     = 8 * wordSize,

      // Before the call to MacroAssembler::debug(), see below.
      return_addr   = 16 * wordSize,
      error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', i.e. not zero
    __ load_klass(rax, rax); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored

    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff

    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  // Generate overlap test for array copy stubs
  //
  // Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  // Output:
  //     rax   - &from[element count]
  //
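  // (Copying forward is unsafe only when 'to' lies inside the source range,
  // i.e. from < to < from + count*elem; both branches below take the
  // no-overlap path otherwise.)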
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, since the
  // latter are non-volatile. r9 and r10 should not be used by the caller.
  //
  DEBUG_ONLY(bool regs_in_thread;)

  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = false;)
  }

  void restore_arg_regs() {
    assert(!regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
  void setup_arg_regs_using_thread() {
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
    __ get_thread(r15_thread);
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);

    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = true;)
  }

  void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs_using_thread");
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ get_thread(r15_thread);
    __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
    __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
    __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored
#endif
  }

  // Copy big chunks forward
  //
  // Inputs:
  //   end_from     - source array end address
  //   end_to       - destination array end address
  //   qword_count  - 64-bit element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
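  //
  // (qword_count enters negative and is incremented toward zero, so the single
  // indexed address form [end + qword_count*8 + disp] walks forward through
  // both arrays while one register serves as both counter and index.)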
  //
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register to,
                          Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
        __ jccb(Assembler::less, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ bind(L_loop_avx512);
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        __ bind(L_below_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        } else {
          __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
          __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
          __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
          __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }

  // Copy big chunks backward
  //
  // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
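  //
  // (Here qword_count starts at the element count and is decremented toward
  // zero, so [from/dest + qword_count*8 + disp] walks backward through both
  // arrays.)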
  //
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register to,
                           Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (AVX3Threshold / 8));
        __ jccb(Assembler::greater, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ BIND(L_loop_avx512);
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        __ bind(L_below_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
          __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        } else {
          __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
          __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
          __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
          __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
          __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
          __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
          __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
          __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop);

        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count); // make the count negative
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);

      __ addptr(end_from, 4);
      __ addptr(end_to, 4);

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(end_from, 8));
      __ movw(Address(end_to, 8), rax);

      __ addptr(end_from, 2);
      __ addptr(end_to, 2);

      // Check for and copy trailing byte
      __ BIND(L_copy_byte);
      __ testl(byte_count, 1);
      __ jccb(Assembler::zero, L_exit);
      __ movb(rax, Address(end_from, 8));
      __ movb(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_1);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3); // count => qword_count

      // Copy from high to low addresses.

      // Check for and copy trailing byte
      __ testl(byte_count, 1);
      __ jcc(Assembler::zero, L_copy_2_bytes);
      __ movb(rax, Address(from, byte_count, Address::times_1, -1));
      __ movb(Address(to, byte_count, Address::times_1, -1), rax);
      __ decrement(byte_count); // Adjust for possible trailing word

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jcc(Assembler::zero, L_copy_4_bytes);
      __ movw(rax, Address(from, byte_count, Address::times_1, -2));
      __ movw(Address(to, byte_count, Address::times_1, -2), rax);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, qword_count, Address::times_8));
      __ movl(Address(to, qword_count, Address::times_8), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1694 // Copy in multi-byte chunks
1695 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1696 }
1697 restore_arg_regs();
1698 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1699 __ xorptr(rax, rax); // return 0
1700 __ vzeroupper();
1701 __ leave(); // required for proper stackwalking of RuntimeStub frame
1702 __ ret(0);
1703
1704 return start;
1705 }
1706
1707 // Arguments:
1708 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1709 // ignored
1710 // name - stub name string
1711 //
1712 // Inputs:
1713 // c_rarg0 - source array address
1714 // c_rarg1 - destination array address
1715 // c_rarg2 - element count, treated as ssize_t, can be zero
1716 //
1717 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1718 // let the hardware handle it. The two or four words within dwords
1719 // or qwords that span cache line boundaries will still be loaded
1720 // and stored atomically.
1721 //
1722 // Side Effects:
1723 // disjoint_short_copy_entry is set to the no-overlap entry point
1724 // used by generate_conjoint_short_copy().
1725 //
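// Roughly, in C (a sketch; copyN() is a hypothetical N-byte move as above):
//
//   size_t q = count >> 2;                         // 4 jshorts per qword
//   for (size_t i = 0; i < q; i++) copy8(to + 8*i, from + 8*i);
//   if (count & 2) copy4(to + 8*q, from + 8*q);    // trailing dword
//   if (count & 1) copy2 of the last jshort, just past the copied tail
//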
1726 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1727 __ align(CodeEntryAlignment);
1728 StubCodeMark mark(this, "StubRoutines", name);
1729 address start = __ pc();
1730
1731 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1732 const Register from = rdi; // source array address
1733 const Register to = rsi; // destination array address
1734 const Register count = rdx; // elements count
1735 const Register word_count = rcx;
1736 const Register qword_count = count;
1737 const Register end_from = from; // source array end address
1738 const Register end_to = to; // destination array end address
1739 // End pointers are inclusive, and if count is not zero they point
1740 // to the last unit copied: end_to[0] := end_from[0]
1741
1742 __ enter(); // required for proper stackwalking of RuntimeStub frame
1743 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1744
1745 if (entry != NULL) {
1746 *entry = __ pc();
1747 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1748 BLOCK_COMMENT("Entry:");
1749 }
1750
1751 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1752 // r9 and r10 may be used to save non-volatile registers
1753
1754 {
1755 // UnsafeCopyMemory page error: continue after ucm
1756 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1757 // 'from', 'to' and 'count' are now valid
1758 __ movptr(word_count, count);
1759 __ shrptr(count, 2); // count => qword_count
1760
1761 // Copy from low to high addresses. Use 'to' as scratch.
1762 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1763 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1764 __ negptr(qword_count);
1765 __ jmp(L_copy_bytes);
1766
1767 // Copy trailing qwords
1768 __ BIND(L_copy_8_bytes);
1769 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1770 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1771 __ increment(qword_count);
1772 __ jcc(Assembler::notZero, L_copy_8_bytes);
1773
1774 // Original 'dest' is trashed, so we can't use it as a
1775 // base register for a possible trailing word copy
1776
1777 // Check for and copy trailing dword
1778 __ BIND(L_copy_4_bytes);
1779 __ testl(word_count, 2);
1780 __ jccb(Assembler::zero, L_copy_2_bytes);
1781 __ movl(rax, Address(end_from, 8));
1782 __ movl(Address(end_to, 8), rax);
1783
1784 __ addptr(end_from, 4);
1785 __ addptr(end_to, 4);
1786
1787 // Check for and copy trailing word
1788 __ BIND(L_copy_2_bytes);
1789 __ testl(word_count, 1);
1790 __ jccb(Assembler::zero, L_exit);
1791 __ movw(rax, Address(end_from, 8));
1792 __ movw(Address(end_to, 8), rax);
1793 }
1794 __ BIND(L_exit);
1795 address ucme_exit_pc = __ pc();
1796 restore_arg_regs();
1797 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1798 __ xorptr(rax, rax); // return 0
1799 __ vzeroupper();
1800 __ leave(); // required for proper stackwalking of RuntimeStub frame
1801 __ ret(0);
1802
1803 {
1804 UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1805 // Copy in multi-byte chunks
1806 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1807 __ jmp(L_copy_4_bytes);
1808 }
1809
1810 return start;
1811 }
1812
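// Arguments:
//   t       - element type (T_BYTE, T_SHORT or T_INT)
//   aligned - true => destination aligned on a HeapWord == 8-byte boundary
//             (used for the arrayof_ fill variants)
//   name    - stub name string
//
// Inputs:
//   c_rarg0 - destination array address
//   c_rarg1 - fill value
//   c_rarg2 - element count
//
// The fill loop itself is emitted by MacroAssembler::generate_fill().
//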
1813 address generate_fill(BasicType t, bool aligned, const char *name) {
1814 __ align(CodeEntryAlignment);
1815 StubCodeMark mark(this, "StubRoutines", name);
1816 address start = __ pc();
1817
1818 BLOCK_COMMENT("Entry:");
1819
1820 const Register to = c_rarg0; // destination array address
1821 const Register value = c_rarg1; // value
1822 const Register count = c_rarg2; // elements count
1823
1824 __ enter(); // required for proper stackwalking of RuntimeStub frame
1825
1826 __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1827
1828 __ vzeroupper();
1829 __ leave(); // required for proper stackwalking of RuntimeStub frame
1830 __ ret(0);
1831 return start;
1832 }
1833
1834 // Arguments:
1835 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1836 // ignored
1837 // name - stub name string
1838 //
1839 // Inputs:
1840 // c_rarg0 - source array address
1841 // c_rarg1 - destination array address
1842 // c_rarg2 - element count, treated as ssize_t, can be zero
1843 //
1844 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1845 // let the hardware handle it. The two or four words within dwords
1846 // or qwords that span cache line boundaries will still be loaded
1847 // and stored atomically.
1848 //
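// The tail handling mirrors the conjoint byte stub above; roughly (sketch):
//
//   if (count & 1) copy2 at byte offset 2*count - 2;     // trailing jshort
//   if (count & 2) copy4 at byte offset 8*(count >> 2);  // trailing dword
//   then copy the remaining count >> 2 qwords from high to low
//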
1849 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1850 address *entry, const char *name) {
1851 __ align(CodeEntryAlignment);
1852 StubCodeMark mark(this, "StubRoutines", name);
1853 address start = __ pc();
1854
1855 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1856 const Register from = rdi; // source array address
1857 const Register to = rsi; // destination array address
1858 const Register count = rdx; // elements count
1859 const Register word_count = rcx;
1860 const Register qword_count = count;
1861
1862 __ enter(); // required for proper stackwalking of RuntimeStub frame
1863 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1864
1865 if (entry != NULL) {
1866 *entry = __ pc();
1867 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1868 BLOCK_COMMENT("Entry:");
1869 }
1870
1871 array_overlap_test(nooverlap_target, Address::times_2);
1872 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1873 // r9 and r10 may be used to save non-volatile registers
1874
1875 {
1876 // UnsafeCopyMemory page error: continue after ucm
1877 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1878 // 'from', 'to' and 'count' are now valid
1879 __ movptr(word_count, count);
1880 __ shrptr(count, 2); // count => qword_count
1881
1882 // Copy from high to low addresses. Use 'to' as scratch.
1883
1884 // Check for and copy trailing word
1885 __ testl(word_count, 1);
1886 __ jccb(Assembler::zero, L_copy_4_bytes);
1887 __ movw(rax, Address(from, word_count, Address::times_2, -2));
1888 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1889
1890 // Check for and copy trailing dword
1891 __ BIND(L_copy_4_bytes);
1892 __ testl(word_count, 2);
1893 __ jcc(Assembler::zero, L_copy_bytes);
1894 __ movl(rax, Address(from, qword_count, Address::times_8));
1895 __ movl(Address(to, qword_count, Address::times_8), rax);
1896 __ jmp(L_copy_bytes);
1897
1898 // Copy trailing qwords
1899 __ BIND(L_copy_8_bytes);
1900 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1901 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1902 __ decrement(qword_count);
1903 __ jcc(Assembler::notZero, L_copy_8_bytes);
1904 }
1905 restore_arg_regs();
1906 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1907 __ xorptr(rax, rax); // return 0
1908 __ vzeroupper();
1909 __ leave(); // required for proper stackwalking of RuntimeStub frame
1910 __ ret(0);
1911
1912 {
1913 // UnsafeCopyMemory page error: continue after ucm
1914 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1915 // Copy in multi-byte chunks
1916 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1917 }
1918 restore_arg_regs();
1919 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1920 __ xorptr(rax, rax); // return 0
1921 __ vzeroupper();
1922 __ leave(); // required for proper stackwalking of RuntimeStub frame
1923 __ ret(0);
1924
1925 return start;
1926 }
1927
1928 // Arguments:
1929 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1930 // ignored
1931 // is_oop - true => oop array, so generate store check code
1932 // name - stub name string
1933 //
1934 // Inputs:
1935 // c_rarg0 - source array address
1936 // c_rarg1 - destination array address
1937 // c_rarg2 - element count, treated as ssize_t, can be zero
1938 //
1939 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1940 // the hardware handle it. The two dwords within qwords that span
1941 // cache line boundaries will still be loaded and stored atomically.
1942 //
1943 // Side Effects:
1944 // disjoint_int_copy_entry is set to the no-overlap entry point
1945 // used by generate_conjoint_int_oop_copy().
1946 //
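// Roughly, in C (a sketch; the oop flavour additionally brackets the loop
// with the BarrierSetAssembler prologue/epilogue emitted below):
//
//   size_t q = count >> 1;                       // 2 jints per qword
//   for (size_t i = 0; i < q; i++) copy8(to + 8*i, from + 8*i);
//   if (count & 1) copy4(to + 8*q, from + 8*q);  // odd trailing jint
//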
1947 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1948 const char *name, bool dest_uninitialized = false) {
1949 __ align(CodeEntryAlignment);
1950 StubCodeMark mark(this, "StubRoutines", name);
1951 address start = __ pc();
1952
1953 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1954 const Register from = rdi; // source array address
1955 const Register to = rsi; // destination array address
1956 const Register count = rdx; // elements count
1957 const Register dword_count = rcx;
1958 const Register qword_count = count;
1959 const Register end_from = from; // source array end address
1960 const Register end_to = to; // destination array end address
1961 // End pointers are inclusive, and if count is not zero they point
1962 // to the last unit copied: end_to[0] := end_from[0]
1963
1964 __ enter(); // required for proper stackwalking of RuntimeStub frame
1965 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1966
1967 if (entry != NULL) {
1968 *entry = __ pc();
1969 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1970 BLOCK_COMMENT("Entry:");
1971 }
1972
1973 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1974 // r9 is used to save r15_thread
1975
1976 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1977 if (dest_uninitialized) {
1978 decorators |= IS_DEST_UNINITIALIZED;
1979 }
1980 if (aligned) {
1981 decorators |= ARRAYCOPY_ALIGNED;
1982 }
1983
1984 BasicType type = is_oop ? T_OBJECT : T_INT;
1985 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1986 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1987
1988 {
1989 // UnsafeCopyMemory page error: continue after ucm
1990 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1991 // 'from', 'to' and 'count' are now valid
1992 __ movptr(dword_count, count);
1993 __ shrptr(count, 1); // count => qword_count
1994
1995 // Copy from low to high addresses. Use 'to' as scratch.
1996 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1997 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1998 __ negptr(qword_count);
1999 __ jmp(L_copy_bytes);
2000
2001 // Copy trailing qwords
2002 __ BIND(L_copy_8_bytes);
2003 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2004 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2005 __ increment(qword_count);
2006 __ jcc(Assembler::notZero, L_copy_8_bytes);
2007
2008 // Check for and copy trailing dword
2009 __ BIND(L_copy_4_bytes);
2010 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2011 __ jccb(Assembler::zero, L_exit);
2012 __ movl(rax, Address(end_from, 8));
2013 __ movl(Address(end_to, 8), rax);
2014 }
2015 __ BIND(L_exit);
2016 address ucme_exit_pc = __ pc();
2017 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2018 restore_arg_regs_using_thread();
2019 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2020 __ vzeroupper();
2021 __ xorptr(rax, rax); // return 0
2022 __ leave(); // required for proper stackwalking of RuntimeStub frame
2023 __ ret(0);
2024
2025 {
2026 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2027 // Copy in multi-byte chunks
2028 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2029 __ jmp(L_copy_4_bytes);
2030 }
2031
2032 return start;
2033 }
2034
2035 // Arguments:
2036 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2037 // ignored
2038 // is_oop - true => oop array, so generate store check code
2039 // name - stub name string
2040 //
2041 // Inputs:
2042 // c_rarg0 - source array address
2043 // c_rarg1 - destination array address
2044 // c_rarg2 - element count, treated as ssize_t, can be zero
2045 //
2046 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2047 // the hardware handle it. The two dwords within qwords that span
2048 // cache line boundaries will still be loaded and stored atomically.
2049 //
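// Roughly (sketch): the odd trailing jint is peeled off at the high end,
//
//   if (count & 1) copy4 at byte offset 4*count - 4;
//   then copy the remaining count >> 1 qwords from high to low
//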
2050 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2051 address *entry, const char *name,
2052 bool dest_uninitialized = false) {
2053 __ align(CodeEntryAlignment);
2054 StubCodeMark mark(this, "StubRoutines", name);
2055 address start = __ pc();
2056
2057 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2058 const Register from = rdi; // source array address
2059 const Register to = rsi; // destination array address
2060 const Register count = rdx; // elements count
2061 const Register dword_count = rcx;
2062 const Register qword_count = count;
2063
2064 __ enter(); // required for proper stackwalking of RuntimeStub frame
2065 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2066
2067 if (entry != NULL) {
2068 *entry = __ pc();
2069 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2070 BLOCK_COMMENT("Entry:");
2071 }
2072
2073 array_overlap_test(nooverlap_target, Address::times_4);
2074 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2075 // r9 is used to save r15_thread
2076
2077 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2078 if (dest_uninitialized) {
2079 decorators |= IS_DEST_UNINITIALIZED;
2080 }
2081 if (aligned) {
2082 decorators |= ARRAYCOPY_ALIGNED;
2083 }
2084
2085 BasicType type = is_oop ? T_OBJECT : T_INT;
2086 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2087 // no registers are destroyed by this call
2088 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2089
2090 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2091 {
2092 // UnsafeCopyMemory page error: continue after ucm
2093 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2094 // 'from', 'to' and 'count' are now valid
2095 __ movptr(dword_count, count);
2096 __ shrptr(count, 1); // count => qword_count
2097
2098 // Copy from high to low addresses. Use 'to' as scratch.
2099
2100 // Check for and copy trailing dword
2101 __ testl(dword_count, 1);
2102 __ jcc(Assembler::zero, L_copy_bytes);
2103 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2104 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2105 __ jmp(L_copy_bytes);
2106
2107 // Copy trailing qwords
2108 __ BIND(L_copy_8_bytes);
2109 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2110 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2111 __ decrement(qword_count);
2112 __ jcc(Assembler::notZero, L_copy_8_bytes);
2113 }
2114 if (is_oop) {
2115 __ jmp(L_exit);
2116 }
2117 restore_arg_regs_using_thread();
2118 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2119 __ xorptr(rax, rax); // return 0
2120 __ vzeroupper();
2121 __ leave(); // required for proper stackwalking of RuntimeStub frame
2122 __ ret(0);
2123
2124 {
2125 // UnsafeCopyMemory page error: continue after ucm
2126 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2127 // Copy in multi-byte chunks
2128 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2129 }
2130
2131 __ BIND(L_exit);
2132 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2133 restore_arg_regs_using_thread();
2134 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2135 __ xorptr(rax, rax); // return 0
2136 __ vzeroupper();
2137 __ leave(); // required for proper stackwalking of RuntimeStub frame
2138 __ ret(0);
2139
2140 return start;
2141 }
2142
2143 // Arguments:
2144 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2145 // ignored
2146 // is_oop - true => oop array, so generate store check code
2147 // name - stub name string
2148 //
2149 // Inputs:
2150 // c_rarg0 - source array address
2151 // c_rarg1 - destination array address
2152 // c_rarg2 - element count, treated as ssize_t, can be zero
2153 //
2154 // Side Effects:
2155 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2156 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2157 //
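// Since each element is already a qword there is no sub-qword tail;
// roughly (sketch):
//
//   for (size_t i = 0; i < count; i++) copy8(to + 8*i, from + 8*i);
//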
2158 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2159 const char *name, bool dest_uninitialized = false) {
2160 __ align(CodeEntryAlignment);
2161 StubCodeMark mark(this, "StubRoutines", name);
2162 address start = __ pc();
2163
2164 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2165 const Register from = rdi; // source array address
2166 const Register to = rsi; // destination array address
2167 const Register qword_count = rdx; // elements count
2168 const Register end_from = from; // source array end address
2169 const Register end_to = rcx; // destination array end address
2170 const Register saved_count = r11;
2171 // End pointers are inclusive, and if count is not zero they point
2172 // to the last unit copied: end_to[0] := end_from[0]
2173
2174 __ enter(); // required for proper stackwalking of RuntimeStub frame
2175 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2176 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2177
2178 if (entry != NULL) {
2179 *entry = __ pc();
2180 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2181 BLOCK_COMMENT("Entry:");
2182 }
2183
2184 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2185 // r9 is used to save r15_thread
2186 // 'from', 'to' and 'qword_count' are now valid
2187
2188 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2189 if (dest_uninitialized) {
2190 decorators |= IS_DEST_UNINITIALIZED;
2191 }
2192 if (aligned) {
2193 decorators |= ARRAYCOPY_ALIGNED;
2194 }
2195
2196 BasicType type = is_oop ? T_OBJECT : T_LONG;
2197 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2198 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2199 {
2200 // UnsafeCopyMemory page error: continue after ucm
2201 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2202
2203 // Copy from low to high addresses. Use 'to' as scratch.
2204 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2205 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2206 __ negptr(qword_count);
2207 __ jmp(L_copy_bytes);
2208
2209 // Copy trailing qwords
2210 __ BIND(L_copy_8_bytes);
2211 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2212 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2213 __ increment(qword_count);
2214 __ jcc(Assembler::notZero, L_copy_8_bytes);
2215 }
2216 if (is_oop) {
2217 __ jmp(L_exit);
2218 } else {
2219 restore_arg_regs_using_thread();
2220 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2221 __ xorptr(rax, rax); // return 0
2222 __ vzeroupper();
2223 __ leave(); // required for proper stackwalking of RuntimeStub frame
2224 __ ret(0);
2225 }
2226
2227 {
2228 // UnsafeCopyMemory page error: continue after ucm
2229 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2230 // Copy in multi-byte chunks
2231 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2232 }
2233
2234 __ BIND(L_exit);
2235 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2236 restore_arg_regs_using_thread();
2237 if (is_oop) {
2238 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2239 } else {
2240 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2241 }
2242 __ vzeroupper();
2243 __ xorptr(rax, rax); // return 0
2244 __ leave(); // required for proper stackwalking of RuntimeStub frame
2245 __ ret(0);
2246
2247 return start;
2248 }
2249
2250 // Arguments:
2251 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2252 // ignored
2253 // is_oop - true => oop array, so generate store check code
2254 // name - stub name string
2255 //
2256 // Inputs:
2257 // c_rarg0 - source array address
2258 // c_rarg1 - destination array address
2259 // c_rarg2 - element count, treated as ssize_t, can be zero
2260 //
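// As above, but walking the qwords from high to low; roughly (sketch):
//
//   for (size_t i = count; i > 0; i--) copy8(to + 8*(i-1), from + 8*(i-1));
//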
2261 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2262 address nooverlap_target, address *entry,
2263 const char *name, bool dest_uninitialized = false) {
2264 __ align(CodeEntryAlignment);
2265 StubCodeMark mark(this, "StubRoutines", name);
2266 address start = __ pc();
2267
2268 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2269 const Register from = rdi; // source array address
2270 const Register to = rsi; // destination array address
2271 const Register qword_count = rdx; // elements count
2272 const Register saved_count = rcx;
2273
2274 __ enter(); // required for proper stackwalking of RuntimeStub frame
2275 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2276
2277 if (entry != NULL) {
2278 *entry = __ pc();
2279 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2280 BLOCK_COMMENT("Entry:");
2281 }
2282
2283 array_overlap_test(nooverlap_target, Address::times_8);
2284 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2285 // r9 is used to save r15_thread
2286 // 'from', 'to' and 'qword_count' are now valid
2287
2288 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2289 if (dest_uninitialized) {
2290 decorators |= IS_DEST_UNINITIALIZED;
2291 }
2292 if (aligned) {
2293 decorators |= ARRAYCOPY_ALIGNED;
2294 }
2295
2296 BasicType type = is_oop ? T_OBJECT : T_LONG;
2297 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2298 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2299 {
2300 // UnsafeCopyMemory page error: continue after ucm
2301 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2302
2303 __ jmp(L_copy_bytes);
2304
2305 // Copy trailing qwords
2306 __ BIND(L_copy_8_bytes);
2307 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2308 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2309 __ decrement(qword_count);
2310 __ jcc(Assembler::notZero, L_copy_8_bytes);
2311 }
2312 if (is_oop) {
2313 __ jmp(L_exit);
2314 } else {
2315 restore_arg_regs_using_thread();
2316 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2317 __ xorptr(rax, rax); // return 0
2318 __ vzeroupper();
2319 __ leave(); // required for proper stackwalking of RuntimeStub frame
2320 __ ret(0);
2321 }
2322 {
2323 // UnsafeCopyMemory page error: continue after ucm
2324 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2325
2326 // Copy in multi-byte chunks
2327 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2328 }
2329 __ BIND(L_exit);
2330 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2331 restore_arg_regs_using_thread();
2332 if (is_oop) {
2333 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2334 } else {
2335 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2336 }
2337 __ vzeroupper();
2338 __ xorptr(rax, rax); // return 0
2339 __ leave(); // required for proper stackwalking of RuntimeStub frame
2340 __ ret(0);
2341
2342 return start;
2343 }
2344
2345
2346 // Helper for generating a dynamic type check.
2347 // Smashes no registers.
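// In outline (a loose paraphrase of the check_klass_subtype_* semantics,
// not their exact code paths):
//
//   if (sub_klass == super_klass) goto L_success;               // fast path
//   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;
//   scan sub_klass's secondary supers; on a hit, goto L_success;
//   otherwise fall through to L_miss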
2348 void generate_type_check(Register sub_klass,
2349 Register super_check_offset,
2350 Register super_klass,
2351 Label& L_success) {
2352 assert_different_registers(sub_klass, super_check_offset, super_klass);
2353
2354 BLOCK_COMMENT("type_check:");
2355
2356 Label L_miss;
2357
2358 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
2359 super_check_offset);
2360 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2361
2362 // Fall through on failure!
2363 __ BIND(L_miss);
2364 }
2365
2366 //
2367 // Generate checkcasting array copy stub
2368 //
2369 // Input:
2370 // c_rarg0 - source array address
2371 // c_rarg1 - destination array address
2372 // c_rarg2 - element count, treated as ssize_t, can be zero
2373 // c_rarg3 - size_t ckoff (super_check_offset)
2374 // not Win64
2375 // c_rarg4 - oop ckval (super_klass)
2376 // Win64
2377 // rsp+40 - oop ckval (super_klass)
2378 //
2379 // Output:
2380 // rax == 0 - success
2381 // rax == -1^K - failure, where K is partial transfer count
2382 //
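// In outline (a C-like sketch of the stub's contract; subtype_of() is a
// hypothetical predicate standing in for the emitted type check):
//
//   for (i = 0; i < count; i++) {
//     oop o = from[i];
//     if (o != NULL && !subtype_of(o->klass(), ckval)) return ~i;  // -1^i
//     to[i] = o;
//   }
//   return 0;
//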
2383 address generate_checkcast_copy(const char *name, address *entry,
2384 bool dest_uninitialized = false) {
2385
2386 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2387
2388 // Input registers (after setup_arg_regs)
2389 const Register from = rdi; // source array address
2390 const Register to = rsi; // destination array address
2391 const Register length = rdx; // elements count
2392 const Register ckoff = rcx; // super_check_offset
2393 const Register ckval = r8; // super_klass
2394
2395 // Registers used as temps (r13, r14 are save-on-entry)
2396 const Register end_from = from; // source array end address
2397 const Register end_to = r13; // destination array end address
2398 const Register count = rdx; // -(count_remaining)
2399 const Register r14_length = r14; // saved copy of length
2400 // End pointers are inclusive, and if length is not zero they point
2401 // to the last unit copied: end_to[0] := end_from[0]
2402
2403 const Register rax_oop = rax; // actual oop copied
2404 const Register r11_klass = r11; // oop._klass
2405
2406 //---------------------------------------------------------------
2407 // Assembler stub will be used for this call to arraycopy
2408 // if the two arrays are subtypes of Object[] but the
2409 // destination array type is not equal to or a supertype
2410 // of the source type. Each element must be separately
2411 // checked.
2412
2413 __ align(CodeEntryAlignment);
2414 StubCodeMark mark(this, "StubRoutines", name);
2415 address start = __ pc();
2416
2417 __ enter(); // required for proper stackwalking of RuntimeStub frame
2418
2419 #ifdef ASSERT
2420 // caller guarantees that the arrays really are different
2421 // otherwise, we would have to make conjoint checks
2422 { Label L;
2423 array_overlap_test(L, TIMES_OOP);
2424 __ stop("checkcast_copy within a single array");
2425 __ bind(L);
2426 }
2427 #endif //ASSERT
2428
2429 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2430 // ckoff => rcx, ckval => r8
2431 // r9 and r10 may be used to save non-volatile registers
2432 #ifdef _WIN64
2433 // last argument (#4) is on stack on Win64
2434 __ movptr(ckval, Address(rsp, 6 * wordSize));
2435 #endif
2436
2437 // Caller of this entry point must set up the argument registers.
2438 if (entry != NULL) {
2439 *entry = __ pc();
2440 BLOCK_COMMENT("Entry:");
2441 }
2442
2443 // allocate spill slots for r13, r14
2444 enum {
2445 saved_r13_offset,
2446 saved_r14_offset,
2447 saved_r10_offset,
2448 saved_rbp_offset
2449 };
2450 __ subptr(rsp, saved_rbp_offset * wordSize);
2451 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2452 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2453 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2454
2455 #ifdef ASSERT
2456 Label L2;
2457 __ get_thread(r14);
2458 __ cmpptr(r15_thread, r14);
2459 __ jcc(Assembler::equal, L2);
2460 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2461 __ bind(L2);
2462 #endif // ASSERT
2463
2464 // check that int operands are properly extended to size_t
2465 assert_clean_int(length, rax);
2466 assert_clean_int(ckoff, rax);
2467
2468 #ifdef ASSERT
2469 BLOCK_COMMENT("assert consistent ckoff/ckval");
2470 // The ckoff and ckval must be mutually consistent,
2471 // even though caller generates both.
2472 { Label L;
2473 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2474 __ cmpl(ckoff, Address(ckval, sco_offset));
2475 __ jcc(Assembler::equal, L);
2476 __ stop("super_check_offset inconsistent");
2477 __ bind(L);
2478 }
2479 #endif //ASSERT
2480
2481 // Loop-invariant addresses. They are exclusive end pointers.
2482 Address end_from_addr(from, length, TIMES_OOP, 0);
2483 Address end_to_addr(to, length, TIMES_OOP, 0);
2484 // Loop-variant addresses. They assume post-incremented count < 0.
2485 Address from_element_addr(end_from, count, TIMES_OOP, 0);
2486 Address to_element_addr(end_to, count, TIMES_OOP, 0);
2487
2488 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2489 if (dest_uninitialized) {
2490 decorators |= IS_DEST_UNINITIALIZED;
2491 }
2492
2493 BasicType type = T_OBJECT;
2494 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2495 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2496
2497 // Copy from low to high addresses, indexed from the end of each array.
2498 __ lea(end_from, end_from_addr);
2499 __ lea(end_to, end_to_addr);
2500 __ movptr(r14_length, length); // save a copy of the length
2501 assert(length == count, ""); // else fix next line:
2502 __ negptr(count); // negate and test the length
2503 __ jcc(Assembler::notZero, L_load_element);
2504
2505 // Empty array: Nothing to do.
2506 __ xorptr(rax, rax); // return 0 on (trivial) success
2507 __ jmp(L_done);
2508
2509 // ======== begin loop ========
2510 // (Loop is rotated; its entry is L_load_element.)
2511 // Loop control:
2512 // for (count = -count; count != 0; count++)
2513 // Base pointers src, dst are biased by 8*(count-1), to the last element.
2514 __ align(OptoLoopAlignment);
2515
2516 __ BIND(L_store_element);
2517 __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW); // store the oop
2518 __ increment(count); // increment the count toward zero
2519 __ jcc(Assembler::zero, L_do_card_marks);
2520
2521 // ======== loop entry is here ========
2522 __ BIND(L_load_element);
2523 __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2524 __ testptr(rax_oop, rax_oop);
2525 __ jcc(Assembler::zero, L_store_element);
2526
2527 __ load_klass(r11_klass, rax_oop);// query the object klass
2528 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2529 // ======== end loop ========
2530
2531 // It was a real error; we must depend on the caller to finish the job.
2532 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2533 // Emit GC store barriers for the oops we have copied (r14 + rdx),
2534 // and report their number to the caller.
2535 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2536 Label L_post_barrier;
2537 __ addptr(r14_length, count); // K = (original - remaining) oops
2538 __ movptr(rax, r14_length); // save the value
2539 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
2540 __ jccb(Assembler::notZero, L_post_barrier);
2541 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2542
2543 // Come here on success only.
2544 __ BIND(L_do_card_marks);
2545 __ xorptr(rax, rax); // return 0 on success
2546
2547 __ BIND(L_post_barrier);
2548 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2549
2550 // Common exit point (success or failure).
2551 __ BIND(L_done);
2552 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2553 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2554 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2555 restore_arg_regs();
2556 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2557 __ leave(); // required for proper stackwalking of RuntimeStub frame
2558 __ ret(0);
2559
2560 return start;
2561 }
2562
2563 //
2564 // Generate 'unsafe' array copy stub
2565 // Though just as safe as the other stubs, it takes an unscaled
2566 // size_t argument instead of an element count.
2567 //
2568 // Input:
2569 // c_rarg0 - source array address
2570 // c_rarg1 - destination array address
2571 // c_rarg2 - byte count, treated as ssize_t, can be zero
2572 //
2573 // Examines the alignment of the operands and dispatches
2574 // to a long, int, short, or byte copy loop.
2575 //
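// Roughly (sketch):
//
//   bits = from | to | size;
//   if ((bits & 7) == 0) goto long_copy  with count = size >> 3;
//   if ((bits & 3) == 0) goto int_copy   with count = size >> 2;
//   if ((bits & 1) == 0) goto short_copy with count = size >> 1;
//   otherwise            goto byte_copy  with count = size;
//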
2576 address generate_unsafe_copy(const char *name,
2577 address byte_copy_entry, address short_copy_entry,
2578 address int_copy_entry, address long_copy_entry) {
2579
2580 Label L_long_aligned, L_int_aligned, L_short_aligned;
2581
2582 // Input registers (before setup_arg_regs)
2583 const Register from = c_rarg0; // source array address
2584 const Register to = c_rarg1; // destination array address
2585 const Register size = c_rarg2; // byte count (size_t)
2586
2587 // Register used as a temp
2588 const Register bits = rax; // test copy of low bits
2589
2590 __ align(CodeEntryAlignment);
2591 StubCodeMark mark(this, "StubRoutines", name);
2592 address start = __ pc();
2593
2594 __ enter(); // required for proper stackwalking of RuntimeStub frame
2595
2596 // bump this on entry, not on exit:
2597 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2598
2599 __ mov(bits, from);
2600 __ orptr(bits, to);
2601 __ orptr(bits, size);
2602
2603 __ testb(bits, BytesPerLong-1);
2604 __ jccb(Assembler::zero, L_long_aligned);
2605
2606 __ testb(bits, BytesPerInt-1);
2607 __ jccb(Assembler::zero, L_int_aligned);
2608
2609 __ testb(bits, BytesPerShort-1);
2610 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2611
2612 __ BIND(L_short_aligned);
2613 __ shrptr(size, LogBytesPerShort); // size => short_count
2614 __ jump(RuntimeAddress(short_copy_entry));
2615
2616 __ BIND(L_int_aligned);
2617 __ shrptr(size, LogBytesPerInt); // size => int_count
2618 __ jump(RuntimeAddress(int_copy_entry));
2619
2620 __ BIND(L_long_aligned);
2621 __ shrptr(size, LogBytesPerLong); // size => qword_count
2622 __ jump(RuntimeAddress(long_copy_entry));
2623
2624 return start;
2625 }
2626
2627 // Perform range checks on the proposed arraycopy.
2628 // Kills temp, but nothing else.
2629 // Also, clean the sign bits of src_pos and dst_pos.
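// Roughly (sketch; the 32-bit unsigned compares catch overflow as well):
//
//   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
//   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
//   src_pos = (int64_t)(int32_t)src_pos;   // re-extend the clean sign bits
//   dst_pos = (int64_t)(int32_t)dst_pos;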
2630 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2631 Register src_pos, // source position (c_rarg1)
2632 Register dst, // destination array oop (c_rarg2)
2633 Register dst_pos, // destination position (c_rarg3)
2634 Register length,
2635 Register temp,
2636 Label& L_failed) {
2637 BLOCK_COMMENT("arraycopy_range_checks:");
2638
2639 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2640 __ movl(temp, length);
2641 __ addl(temp, src_pos); // src_pos + length
2642 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2643 __ jcc(Assembler::above, L_failed);
2644
2645 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2646 __ movl(temp, length);
2647 __ addl(temp, dst_pos); // dst_pos + length
2648 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2649 __ jcc(Assembler::above, L_failed);
2650
2651 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2652 // Move with sign extension can be used since they are positive.
2653 __ movslq(src_pos, src_pos);
2654 __ movslq(dst_pos, dst_pos);
2655
2656 BLOCK_COMMENT("arraycopy_range_checks done");
2657 }
2658
2659 //
2660 // Generate generic array copy stubs
2661 //
2662 // Input:
2663 // c_rarg0 - src oop
2664 // c_rarg1 - src_pos (32-bits)
2665 // c_rarg2 - dst oop
2666 // c_rarg3 - dst_pos (32-bits)
2667 // not Win64
2668 // c_rarg4 - element count (32-bits)
2669 // Win64
2670 // rsp+40 - element count (32-bits)
2671 //
2672 // Output:
2673 // rax == 0 - success
2674 // rax == -1^K - failure, where K is partial transfer count
2675 //
2676 address generate_generic_copy(const char *name,
2677 address byte_copy_entry, address short_copy_entry,
2678 address int_copy_entry, address oop_copy_entry,
2679 address long_copy_entry, address checkcast_copy_entry) {
2680
2681 Label L_failed, L_failed_0, L_objArray;
2682 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2683
2684 // Input registers
2685 const Register src = c_rarg0; // source array oop
2686 const Register src_pos = c_rarg1; // source position
2687 const Register dst = c_rarg2; // destination array oop
2688 const Register dst_pos = c_rarg3; // destination position
2689 #ifndef _WIN64
2690 const Register length = c_rarg4;
2691 #else
2692 const Address length(rsp, 6 * wordSize); // elements count is on stack on Win64
2693 #endif
2694
2695 { int modulus = CodeEntryAlignment;
2696 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
2697 int advance = target - (__ offset() % modulus);
2698 if (advance < 0) advance += modulus;
2699 if (advance > 0) __ nop(advance);
2700 }
2701 StubCodeMark mark(this, "StubRoutines", name);
2702
2703 // Short-hop target to L_failed. Makes for denser prologue code.
2704 __ BIND(L_failed_0);
2705 __ jmp(L_failed);
2706 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2707
2708 __ align(CodeEntryAlignment);
2709 address start = __ pc();
2710
2711 __ enter(); // required for proper stackwalking of RuntimeStub frame
2712
2713 // bump this on entry, not on exit:
2714 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2715
2716 //-----------------------------------------------------------------------
2717 // Assembler stub will be used for this call to arraycopy
2718 // if the following conditions are met:
2719 //
2720 // (1) src and dst must not be null.
2721 // (2) src_pos must not be negative.
2722 // (3) dst_pos must not be negative.
2723 // (4) length must not be negative.
2724 // (5) src klass and dst klass should be the same and not NULL.
2725 // (6) src and dst should be arrays.
2726 // (7) src_pos + length must not exceed length of src.
2727 // (8) dst_pos + length must not exceed length of dst.
2728 //
2729
2730 // if (src == NULL) return -1;
2731 __ testptr(src, src); // src oop
2732 size_t j1off = __ offset();
2733 __ jccb(Assembler::zero, L_failed_0);
2734
2735 // if (src_pos < 0) return -1;
2736 __ testl(src_pos, src_pos); // src_pos (32-bits)
2737 __ jccb(Assembler::negative, L_failed_0);
2738
2739 // if (dst == NULL) return -1;
2740 __ testptr(dst, dst); // dst oop
2741 __ jccb(Assembler::zero, L_failed_0);
2742
2743 // if (dst_pos < 0) return -1;
2744 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2745 size_t j4off = __ offset();
2746 __ jccb(Assembler::negative, L_failed_0);
2747
2748 // The first four tests are very dense code,
2749 // but not quite dense enough to put four
2750 // jumps in a 16-byte instruction fetch buffer.
2751 // That's good, because some branch predictors
2752 // do not like jumps so close together.
2753 // Make sure of this.
2754 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2755
2756 // registers used as temp
2757 const Register r11_length = r11; // elements count to copy
2758 const Register r10_src_klass = r10; // array klass
2759
2760 // if (length < 0) return -1;
2761 __ movl(r11_length, length); // length (elements count, 32-bits value)
2762 __ testl(r11_length, r11_length);
2763 __ jccb(Assembler::negative, L_failed_0);
2764
2765 __ load_klass(r10_src_klass, src);
2766 #ifdef ASSERT
2767 // assert(src->klass() != NULL);
2768 {
2769 BLOCK_COMMENT("assert klasses not null {");
2770 Label L1, L2;
2771 __ testptr(r10_src_klass, r10_src_klass);
2772 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
2773 __ bind(L1);
2774 __ stop("broken null klass");
2775 __ bind(L2);
2776 __ load_klass(rax, dst);
2777 __ cmpq(rax, 0);
2778 __ jcc(Assembler::equal, L1); // this would be broken also
2779 BLOCK_COMMENT("} assert klasses not null done");
2780 }
2781 #endif
2782
2783 // Load layout helper (32-bits)
2784 //
2785 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2786 //  32        30    24            16              8     2                0
2787 //
2788 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2789 //
2790
2791 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2792
2793 // Handle objArrays completely differently...
2794 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2795 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2796 __ jcc(Assembler::equal, L_objArray);
2797
2798 // if (src->klass() != dst->klass()) return -1;
2799 __ load_klass(rax, dst);
2800 __ cmpq(r10_src_klass, rax);
2801 __ jcc(Assembler::notEqual, L_failed);
2802
2803 const Register rax_lh = rax; // layout helper
2804 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2805
2806 // if (!src->is_Array()) return -1;
2807 __ cmpl(rax_lh, Klass::_lh_neutral_value);
2808 __ jcc(Assembler::greaterEqual, L_failed);
2809
2810 // At this point, it is known to be a typeArray (array_tag 0x3).
2811 #ifdef ASSERT
2812 {
2813 BLOCK_COMMENT("assert primitive array {");
2814 Label L;
2815 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2816 __ jcc(Assembler::greaterEqual, L);
2817 __ stop("must be a primitive array");
2818 __ bind(L);
2819 BLOCK_COMMENT("} assert primitive array done");
2820 }
2821 #endif
2822
2823 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2824 r10, L_failed);
2825
2826 // TypeArrayKlass
2827 //
2828 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2829 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2830 //
2831
2832 const Register r10_offset = r10; // array offset
2833 const Register rax_elsize = rax_lh; // element size
2834
2835 __ movl(r10_offset, rax_lh);
2836 __ shrl(r10_offset, Klass::_lh_header_size_shift);
2837 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
2838 __ addptr(src, r10_offset); // src array offset
2839 __ addptr(dst, r10_offset); // dst array offset
2840 BLOCK_COMMENT("choose copy loop based on element size");
2841 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2842
2843 // The following registers must be set before jumping to the corresponding copy stub:
2844 const Register from = c_rarg0; // source array address
2845 const Register to = c_rarg1; // destination array address
2846 const Register count = c_rarg2; // elements count
2847
2848 // The 'from', 'to' and 'count' registers must be set in this order,
2849 // since they alias 'src', 'src_pos' and 'dst'.
2850
2851 __ BIND(L_copy_bytes);
2852 __ cmpl(rax_elsize, 0);
2853 __ jccb(Assembler::notEqual, L_copy_shorts);
2854 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2855 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2856 __ movl2ptr(count, r11_length); // length
2857 __ jump(RuntimeAddress(byte_copy_entry));
2858
2859 __ BIND(L_copy_shorts);
2860 __ cmpl(rax_elsize, LogBytesPerShort);
2861 __ jccb(Assembler::notEqual, L_copy_ints);
2862 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2863 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2864 __ movl2ptr(count, r11_length); // length
2865 __ jump(RuntimeAddress(short_copy_entry));
2866
2867 __ BIND(L_copy_ints);
2868 __ cmpl(rax_elsize, LogBytesPerInt);
2869 __ jccb(Assembler::notEqual, L_copy_longs);
2870 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2871 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2872 __ movl2ptr(count, r11_length); // length
2873 __ jump(RuntimeAddress(int_copy_entry));
2874
2875 __ BIND(L_copy_longs);
2876 #ifdef ASSERT
2877 {
2878 BLOCK_COMMENT("assert long copy {");
2879 Label L;
2880 __ cmpl(rax_elsize, LogBytesPerLong);
2881 __ jcc(Assembler::equal, L);
2882 __ stop("must be long copy, but elsize is wrong");
2883 __ bind(L);
2884 BLOCK_COMMENT("} assert long copy done");
2885 }
2886 #endif
2887 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2888 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2889 __ movl2ptr(count, r11_length); // length
2890 __ jump(RuntimeAddress(long_copy_entry));
2891
2892 // ObjArrayKlass
2893 __ BIND(L_objArray);
2894 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
2895
2896 Label L_plain_copy, L_checkcast_copy;
2897 // test array classes for subtyping
2898 __ load_klass(rax, dst);
2899 __ cmpq(r10_src_klass, rax); // usual case is exact equality
2900 __ jcc(Assembler::notEqual, L_checkcast_copy);
2901
2902 // Identically typed arrays can be copied without element-wise checks.
2903 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2904 r10, L_failed);
2905
2906 __ lea(from, Address(src, src_pos, TIMES_OOP,
2907 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2908 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2909 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2910 __ movl2ptr(count, r11_length); // length
2911 __ BIND(L_plain_copy);
2912 __ jump(RuntimeAddress(oop_copy_entry));
2913
2914 __ BIND(L_checkcast_copy);
2915 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
2916 {
2917 // Before looking at dst.length, make sure dst is also an objArray.
2918 __ cmpl(Address(rax, lh_offset), objArray_lh);
2919 __ jcc(Assembler::notEqual, L_failed);
2920
2921 // It is safe to examine both src.length and dst.length.
2922 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2923 rax, L_failed);
2924
2925 const Register r11_dst_klass = r11;
2926 __ load_klass(r11_dst_klass, dst); // reload
2927
2928 // Marshal the base address arguments now, freeing registers.
2929 __ lea(from, Address(src, src_pos, TIMES_OOP,
2930 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2931 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
2932 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2933 __ movl(count, length); // length (reloaded)
2934 Register sco_temp = c_rarg3; // this register is free now
2935 assert_different_registers(from, to, count, sco_temp,
2936 r11_dst_klass, r10_src_klass);
2937 assert_clean_int(count, sco_temp);
2938
2939 // Generate the type check.
2940 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2941 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2942 assert_clean_int(sco_temp, rax);
2943 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2944
2945 // Fetch destination element klass from the ObjArrayKlass header.
2946 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2947 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2948 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
2949 assert_clean_int(sco_temp, rax);
2950
2951 // the checkcast_copy loop needs two extra arguments:
2952 assert(c_rarg3 == sco_temp, "#3 already in place");
2953 // Set up arguments for checkcast_copy_entry.
2954 setup_arg_regs(4);
2955 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2956 __ jump(RuntimeAddress(checkcast_copy_entry));
2957 }
2958
2959 __ BIND(L_failed);
2960 __ xorptr(rax, rax);
2961 __ notptr(rax); // return -1
2962 __ leave(); // required for proper stackwalking of RuntimeStub frame
2963 __ ret(0);
2964
2965 return start;
2966 }
2967
2968 void generate_arraycopy_stubs() {
2969 address entry;
2970 address entry_jbyte_arraycopy;
2971 address entry_jshort_arraycopy;
2972 address entry_jint_arraycopy;
2973 address entry_oop_arraycopy;
2974 address entry_jlong_arraycopy;
2975 address entry_checkcast_arraycopy;
2976
2977 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2978 "jbyte_disjoint_arraycopy");
2979 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2980 "jbyte_arraycopy");
2981
2982 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2983 "jshort_disjoint_arraycopy");
2984 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2985 "jshort_arraycopy");
2986
2987 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry,
2988 "jint_disjoint_arraycopy");
2989 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry,
2990 &entry_jint_arraycopy, "jint_arraycopy");
2991
2992 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry,
2993 "jlong_disjoint_arraycopy");
2994 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry,
2995 &entry_jlong_arraycopy, "jlong_arraycopy");
2996
2997
2998 if (UseCompressedOops) {
2999 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry,
3000 "oop_disjoint_arraycopy");
3001 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry,
3002 &entry_oop_arraycopy, "oop_arraycopy");
3003 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
3004 "oop_disjoint_arraycopy_uninit",
3005 /*dest_uninitialized*/true);
3006 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry,
3007 NULL, "oop_arraycopy_uninit",
3008 /*dest_uninitialized*/true);
3009 } else {
3010 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry,
3011 "oop_disjoint_arraycopy");
3012 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry,
3013 &entry_oop_arraycopy, "oop_arraycopy");
3014 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
3015 "oop_disjoint_arraycopy_uninit",
3016 /*dest_uninitialized*/true);
3017 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry,
3018 NULL, "oop_arraycopy_uninit",
3019 /*dest_uninitialized*/true);
3020 }
3021
3022 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3023 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3024 /*dest_uninitialized*/true);
3025
3026 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3027 entry_jbyte_arraycopy,
3028 entry_jshort_arraycopy,
3029 entry_jint_arraycopy,
3030 entry_jlong_arraycopy);
3031 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3032 entry_jbyte_arraycopy,
3033 entry_jshort_arraycopy,
3034 entry_jint_arraycopy,
3035 entry_oop_arraycopy,
3036 entry_jlong_arraycopy,
3037 entry_checkcast_arraycopy);
3038
3039 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3040 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3041 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3042 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3043 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3044 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3045
3046 // We don't generate specialized code for HeapWord-aligned source
3047 // arrays, so just use the code we've already generated
3048 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
3049 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
3050
3051 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3052 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
3053
3054 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
3055 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
3056
3057 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
3058 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
3059
3060 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
3061 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
3062
3063 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
3064 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
3065 }
3066
3067 // AES intrinsic stubs
3068 enum {AESBlockSize = 16};
3069
3070 address generate_key_shuffle_mask() {
3071 __ align(16);
3072 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3073 address start = __ pc();
3074 __ emit_data64( 0x0405060700010203, relocInfo::none );
3075 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3076 return start;
3077 }
3078
3079 address generate_counter_shuffle_mask() {
3080 __ align(16);
3081 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3082 address start = __ pc();
3083 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3084 __ emit_data64(0x0001020304050607, relocInfo::none);
3085 return start;
3086 }
3087
3088 // Utility routine for loading a 128-bit key word in little-endian format;
3089 // the shuffle mask can optionally be supplied in an XMM register.
3090 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3091 __ movdqu(xmmdst, Address(key, offset));
3092 if (xmm_shuf_mask != NULL) {
3093 __ pshufb(xmmdst, xmm_shuf_mask);
3094 } else {
3095 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3096 }
3097 }
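
  // Rationale for the shuffle: the Java key schedule stores each round-key
  // word as a big-endian int, so after the little-endian load above, pshufb
  // with the key shuffle mask reverses the bytes within each 32-bit word to
  // give the byte order the AES instructions expect.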

  // Utility routine for increasing the 128-bit counter (the IV in CTR mode).
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrq(reg, xmmdst, 0x0);
    __ addq(reg, inc_delta);
    __ pinsrq(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry
    __ pextrq(reg, xmmdst, 0x01); // Carry
    __ addq(reg, 0x01);
    __ pinsrq(xmmdst, reg, 0x01); // Carry end
    __ BIND(next_block);          // next instruction
  }
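
  // Worked example: if the low qword holds 0xFFFFFFFFFFFFFFFF, the addq
  // wraps it and sets CF; pinsrq does not modify flags, so the jcc above
  // still sees the carry and the high qword is incremented by one.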

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need.
    // We don't know if the key is aligned, hence not using the load-execute form.

    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax);                    // return 0
    __ leave();                             // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // for decryption the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using the load-execute form
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // for decryption the aesdeclast operation is always on key+0x00
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax);                    // return 0
    __ leave();                             // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r11;      // pick the volatile windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    __ movdqu(xmm_result, Address(rvec, 0x00));  // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);  // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax);  // return length
#endif
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key12);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
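
  // Note: CBC encryption is inherently serial -- block i is computed as
  // E(plaintext_i XOR ciphertext_{i-1}), so each block needs the previous
  // ciphertext first. Hence the one-block-per-iteration loops above, while
  // the CBC decrypt stub below can process four blocks at a time.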

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   rax = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ movl(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ movq(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ movq(rax, c_rarg1);
    __ ret(0);
  }
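
  // The fault_pc/continuation_pc pair is consumed by the platform signal
  // handler (elsewhere): when the load at *fault_pc faults, execution is
  // resumed at *continuation_pc. Since the load into c_rarg1 never
  // completed, c_rarg1 still holds errValue, which the continuation code
  // then copies into rax as the return value.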

  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r11;      // pick the volatile windows register
#endif
    const Register pos         = rax;

    const int PARALLEL_FACTOR = 4;
    const int ROUNDS[3] = { 10, 12, 14 };  // aes rounds for key128, key192, key256

    Label L_exit;
    Label L_singleBlock_loopTopHead[3];   // 128, 192, 256
    Label L_singleBlock_loopTopHead2[3];  // 128, 192, 256
    Label L_singleBlock_loopTop[3];       // 128, 192, 256
    Label L_multiBlock_loopTopHead[3];    // 128, 192, 256
    Label L_multiBlock_loopTop[3];        // 128, 192, 256

    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif
    __ push(rbx);
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with keys 0x10 - 0xa0, then 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // initialize with initial rvec

    __ xorptr(pos, pos);

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rbx, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
    __ cmpl(rbx, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);

#define DoFour(opc, src_reg)           \
  __ opc(xmm_result0, src_reg);        \
  __ opc(xmm_result1, src_reg);        \
  __ opc(xmm_result2, src_reg);        \
  __ opc(xmm_result3, src_reg);

    for (int k = 0; k < 3; ++k) {
      __ BIND(L_multiBlock_loopTopHead[k]);
      if (k != 0) {
        __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize);  // see if at least 4 blocks left
        __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
      }
      if (k == 1) {
        __ subptr(rsp, 6 * wordSize);
        __ movdqu(Address(rsp, 0), xmm15);  // save last_key from xmm15
        load_key(xmm15, key, 0xb0);         // 0xb0; 192-bit key goes up to 0xc0
        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
        load_key(xmm1, key, 0xc0);          // 0xc0;
        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
      } else if (k == 2) {
        __ subptr(rsp, 10 * wordSize);
        __ movdqu(Address(rsp, 0), xmm15);  // save last_key from xmm15
        load_key(xmm15, key, 0xd0);         // 0xd0; 256-bit key goes up to 0xe0
        __ movdqu(Address(rsp, 6 * wordSize), xmm15);
        load_key(xmm1, key, 0xe0);          // 0xe0;
        __ movdqu(Address(rsp, 8 * wordSize), xmm1);
        load_key(xmm15, key, 0xb0);         // 0xb0;
        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
        load_key(xmm1, key, 0xc0);          // 0xc0;
        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
      }
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize);  // see if at least 4 blocks left
      __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);

      if (k != 0) {
        __ movdqu(xmm15, Address(rsp, 2 * wordSize));
        __ movdqu(xmm1, Address(rsp, 4 * wordSize));
      }

      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize));  // get next 4 blocks into xmmresult registers
      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));

      DoFour(pxor, xmm_key_first);
      if (k == 0) {
        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        DoFour(aesdeclast, xmm_key_last);
      } else if (k == 1) {
        for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        __ movdqu(xmm_key_last, Address(rsp, 0));  // xmm15 needs to be loaded again.
        DoFour(aesdec, xmm1);                      // key : 0xc0
        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
        DoFour(aesdeclast, xmm_key_last);
      } else if (k == 2) {
        for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
        }
        DoFour(aesdec, xmm1);                      // key : 0xc0
        __ movdqu(xmm15, Address(rsp, 6 * wordSize));
        __ movdqu(xmm1, Address(rsp, 8 * wordSize));
        DoFour(aesdec, xmm15);                     // key : 0xd0
        __ movdqu(xmm_key_last, Address(rsp, 0));  // xmm15 needs to be loaded again.
        DoFour(aesdec, xmm1);                      // key : 0xe0
        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
        DoFour(aesdeclast, xmm_key_last);
      }

      // for each result, xor with the r vector of previous cipher block
      __ pxor(xmm_result0, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result1, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ pxor(xmm_result2, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ pxor(xmm_result3, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));  // this will carry over to next set of blocks
      if (k != 0) {
        __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
      }

      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);  // store 4 results into the next 64 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
      __ jmp(L_multiBlock_loopTop[k]);

      // registers used in the non-parallelized loops
      // xmm register assignments for the loops below
      const XMMRegister xmm_result = xmm0;
      const XMMRegister xmm_prev_block_cipher_save = xmm2;
      const XMMRegister xmm_key11 = xmm3;
      const XMMRegister xmm_key12 = xmm4;
      const XMMRegister key_tmp = xmm4;

      __ BIND(L_singleBlock_loopTopHead[k]);
      if (k == 1) {
        __ addptr(rsp, 6 * wordSize);
      } else if (k == 2) {
        __ addptr(rsp, 10 * wordSize);
      }
      __ cmpptr(len_reg, 0);  // any blocks left?
      __ jcc(Assembler::equal, L_exit);
      __ BIND(L_singleBlock_loopTopHead2[k]);
      if (k == 1) {
        load_key(xmm_key11, key, 0xb0);  // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0);  // 0xc0; 192-bit key goes up to 0xc0
      }
      if (k == 2) {
        load_key(xmm_key11, key, 0xb0);  // 0xb0; 256-bit key goes up to 0xe0
      }
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlock_loopTop[k]);
      __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
      __ movdqa(xmm_prev_block_cipher_save, xmm_result);  // save for next r vector
      __ pxor(xmm_result, xmm_key_first);  // do the aes dec rounds
      for (int rnum = 1; rnum <= 9; rnum++) {
        __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
      }
      if (k == 1) {
        __ aesdec(xmm_result, xmm_key11);
        __ aesdec(xmm_result, xmm_key12);
      }
      if (k == 2) {
        __ aesdec(xmm_result, xmm_key11);
        load_key(key_tmp, key, 0xc0);
        __ aesdec(xmm_result, key_tmp);
        load_key(key_tmp, key, 0xd0);
        __ aesdec(xmm_result, key_tmp);
        load_key(key_tmp, key, 0xe0);
        __ aesdec(xmm_result, key_tmp);
      }

      __ aesdeclast(xmm_result, xmm_key_last);     // xmm15 always came from key+0
      __ pxor(xmm_result, xmm_prev_block_cipher);  // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
      if (k != 2) {
        __ jmp(L_exit);
      }
    } // for 128/192/256

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);  // final value of r stored in rvec of CipherBlockChaining object
    __ pop(rbx);
#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax);  // return length
#endif
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
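
  // Note on the 192/256-bit paths above: sixteen XMM registers cannot hold
  // the whole key schedule, so round keys 0xb0-0xe0 are spilled to the
  // stack up front and reloaded on every loop iteration, because xmm1 and
  // xmm15 double as xmm_prev_block_cipher and xmm_key_last and get
  // clobbered inside the loop.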

  address generate_upper_word_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
    address start = __ pc();
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
    return start;
  }

  address generate_shuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }

  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister abcd = xmm0;
    const XMMRegister e0 = xmm1;
    const XMMRegister e1 = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
                 buf, state, ofs, limit, rsp, multi_block);

    __ addptr(rsp, 4 * wordSize);

    __ leave();
    __ ret(0);
    return start;
  }

  address generate_pshuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x0405060700010203, relocInfo::none);
    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);

    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0405060700010203, relocInfo::none);  // second copy
      __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
      // _SHUF_00BA
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      // _SHUF_DC00
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
    }

    return start;
  }

  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
  address generate_pshuffle_byte_flip_mask_sha512() {
    __ align(32);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
    address start = __ pc();
    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0001020304050607, relocInfo::none);  // PSHUFFLE_BYTE_FLIP_MASK
      __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
      __ emit_data64(0x1011121314151617, relocInfo::none);
      __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
      __ emit_data64(0x0000000000000000, relocInfo::none);  // MASK_YMM_LO
      __ emit_data64(0x0000000000000000, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
    }

    return start;
  }

  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    if (VM_Version::supports_sha()) {
      __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    } else if (VM_Version::supports_avx2()) {
      __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    }
    __ addptr(rsp, 4 * wordSize);
    __ vzeroupper();
    __ leave();
    __ ret(0);
    return start;
  }

  address generate_sha512_implCompress(bool multi_block, const char *name) {
    assert(VM_Version::supports_avx2(), "");
    assert(VM_Version::supports_bmi2(), "");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;
    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();

    __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                   buf, state, ofs, limit, rsp, multi_block, shuf_mask);

    __ vzeroupper();
    __ leave();
    __ ret(0);
    return start;
  }

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   Linux
  //     c_rarg4   -          input length
  //     c_rarg5   -          saved encryptedCounter start
  //     rbp + 6 * wordSize - saved used length
  //   Windows
  //     rbp + 6 * wordSize - input length
  //     rbp + 7 * wordSize - saved encryptedCounter start
  //     rbp + 8 * wordSize - saved used length
  //
  // Output:
  //   rax       - input length
  //
  address generate_counterMode_AESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    address start = __ pc();
    const Register from = c_rarg0;     // source array address
    const Register to = c_rarg1;       // destination array address
    const Register key = c_rarg2;      // key array address
    const Register counter = c_rarg3;  // counter byte array initialized from counter array address
                                       // and updated with the incremented counter in the end
#ifndef _WIN64
    const Register len_reg = c_rarg4;
    const Register saved_encCounter_start = c_rarg5;
    const Register used_addr = r10;
    const Address  used_mem(rbp, 2 * wordSize);
    const Register used = r11;
#else
    const Address len_mem(rbp, 6 * wordSize);               // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize);  // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize);              // used length is on stack on Win64
    const Register len_reg = r10;      // pick the first volatile windows register
    const Register saved_encCounter_start = r11;
    const Register used_addr = r13;
    const Register used = r14;
#endif
    const Register pos = rax;

    const int PARALLEL_FACTOR = 6;
    const XMMRegister xmm_counter_shuf_mask = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    const XMMRegister xmm_curr_counter = xmm2;

    const XMMRegister xmm_key_tmp0 = xmm3;
    const XMMRegister xmm_key_tmp1 = xmm4;

    // registers holding the six results in the parallelized loop
    const XMMRegister xmm_result0 = xmm5;
    const XMMRegister xmm_result1 = xmm6;
    const XMMRegister xmm_result2 = xmm7;
    const XMMRegister xmm_result3 = xmm8;
    const XMMRegister xmm_result4 = xmm9;
    const XMMRegister xmm_result5 = xmm10;

    const XMMRegister xmm_from0 = xmm11;
    const XMMRegister xmm_from1 = xmm12;
    const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14;  // the last one is xmm14; we have to preserve it on WIN64.
    const XMMRegister xmm_from4 = xmm3;   // reuse xmm3~4, because xmm_key_tmp0~1 are not needed once the input text is loaded
    const XMMRegister xmm_from5 = xmm4;

    // for key_128, key_192, key_256
    const int rounds[3] = {10, 12, 14};
    Label L_exit_preLoop, L_preLoop_start;
    Label L_multiBlock_loopTop[3];
    Label L_singleBlockLoopTop[3];
    Label L__incCounter[3][6];      // for 6 blocks
    Label L__incCounter_single[3];  // for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

    Label L_exit;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // allocate spill slots for r13, r14
    enum {
      saved_r13_offset,
      saved_r14_offset
    };
    __ subptr(rsp, 2 * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);

    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg);  // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif

    __ push(rbx);  // Save RBX
    __ movdqu(xmm_curr_counter, Address(counter, 0x00));  // initialize counter with initial counter
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos);  // pos as scratch
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask);   // counter is shuffled
    __ movptr(pos, 0);

    // Use the partially used encrypted counter from the last invocation
    __ BIND(L_preLoop_start);
    __ cmpptr(used, 16);
    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
    __ cmpptr(len_reg, 0);
    __ jcc(Assembler::lessEqual, L_exit_preLoop);
    __ movb(rbx, Address(saved_encCounter_start, used));
    __ xorb(rbx, Address(from, pos));
    __ movb(Address(to, pos), rbx);
    __ addptr(pos, 1);
    __ addptr(used, 1);
    __ subptr(len_reg, 1);

    __ jmp(L_preLoop_start);

    __ BIND(L_exit_preLoop);
    __ movl(Address(used_addr, 0), used);

    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx);  // rbx as scratch
    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rbx, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rbx, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);

#define CTR_DoSix(opc, src_reg)        \
  __ opc(xmm_result0, src_reg);        \
  __ opc(xmm_result1, src_reg);        \
  __ opc(xmm_result2, src_reg);        \
  __ opc(xmm_result3, src_reg);        \
  __ opc(xmm_result4, src_reg);        \
  __ opc(xmm_result5, src_reg);

    // k == 0 :  generate code for key_128
    // k == 1 :  generate code for key_192
    // k == 2 :  generate code for key_256
    for (int k = 0; k < 3; ++k) {
      // multi blocks starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize);  // see if at least PARALLEL_FACTOR blocks left
      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);

      // load, then increase counters
      CTR_DoSix(movdqa, xmm_curr_counter);
      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
      inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask);  // after increased, shuffled counters back for PXOR
      CTR_DoSix(pxor, xmm_key_tmp0);             // PXOR with Round 0 key

      // load two ROUND_KEYs at a time
      for (int i = 1; i < rounds[k]; ) {
        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
        CTR_DoSix(aesenc, xmm_key_tmp1);
        i++;
        if (i != rounds[k]) {
          CTR_DoSix(aesenc, xmm_key_tmp0);
        } else {
          CTR_DoSix(aesenclast, xmm_key_tmp0);
        }
        i++;
      }

      // get next PARALLEL_FACTOR blocks into xmm_result registers
      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));

      __ pxor(xmm_result0, xmm_from0);
      __ pxor(xmm_result1, xmm_from1);
      __ pxor(xmm_result2, xmm_from2);
      __ pxor(xmm_result3, xmm_from3);
      __ pxor(xmm_result4, xmm_from4);
      __ pxor(xmm_result5, xmm_from5);

      // store 6 results into the next 96 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);      // increase the length of crypt text
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);  // decrease the remaining length
      __ jmp(L_multiBlock_loopTop[k]);

      // singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlockLoopTop[k]);
      __ cmpptr(len_reg, 0);
      __ jcc(Assembler::lessEqual, L_exit);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
      __ movdqa(xmm_result0, xmm_curr_counter);
      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
      __ pxor(xmm_result0, xmm_key_tmp0);
      for (int i = 1; i < rounds[k]; i++) {
        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
        __ aesenc(xmm_result0, xmm_key_tmp0);
      }
      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
      __ aesenclast(xmm_result0, xmm_key_tmp0);
      __ cmpptr(len_reg, AESBlockSize);
      __ jcc(Assembler::less, L_processTail_insr[k]);
        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
        __ pxor(xmm_result0, xmm_from0);
        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
        __ addptr(pos, AESBlockSize);
        __ subptr(len_reg, AESBlockSize);
        __ jmp(L_singleBlockLoopTop[k]);
      __ BIND(L_processTail_insr[k]);                      // Process the tail part of the input array
        __ addptr(pos, len_reg);                           // 1. Insert bytes from src array into xmm_from0 register
        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
          __ subptr(pos, 8);
          __ pinsrq(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_4_insr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
          __ subptr(pos, 4);
          __ pslldq(xmm_from0, 4);
          __ pinsrd(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_2_insr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
          __ subptr(pos, 2);
          __ pslldq(xmm_from0, 2);
          __ pinsrw(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_1_insr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
          __ subptr(pos, 1);
          __ pslldq(xmm_from0, 1);
          __ pinsrb(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_exit_insr[k]);

        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext bytes.
        __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.

        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_extr[k]);  // 3. Extract bytes from xmm_result0 into the dest. array
          __ pextrq(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 8);
          __ addptr(pos, 8);
        __ BIND(L_processTail_4_extr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
          __ pextrd(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 4);
          __ addptr(pos, 4);
        __ BIND(L_processTail_2_extr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
          __ pextrw(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 2);
          __ addptr(pos, 2);
        __ BIND(L_processTail_1_extr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
          __ pextrb(Address(to, pos), xmm_result0, 0);

        __ BIND(L_processTail_exit_extr[k]);
        __ movl(Address(used_addr, 0), len_reg);
        __ jmp(L_exit);

    }

    __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask);  // counter is shuffled back.
    __ movdqu(Address(counter, 0), xmm_curr_counter);    // save counter back
    __ pop(rbx);  // pop the saved RBX.
#ifdef _WIN64
    __ movl(rax, len_mem);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    __ addptr(rsp, 2 * wordSize);
#else
    __ pop(rax);  // return 'len'
#endif
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  void roundDec(XMMRegister xmm_reg) {
    __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
    __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
  }

  void roundDeclast(XMMRegister xmm_reg) {
    __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
    __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
  }
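
  // xmm1..xmm8 each hold four 16-byte blocks in a 512-bit register, so one
  // roundDec/roundDeclast call advances 8 * 4 = 32 blocks (512 bytes) by a
  // single AES round -- matching the 512-byte chunk consumed per iteration
  // of the main loop in the stub below.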

  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
    __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
  }
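
  // evshufi64x2 with imm 0x0 replicates the low 128-bit lane into all four
  // lanes, broadcasting the just-loaded round key across the 512-bit
  // register so the same key applies to every block held in a ZMM register.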

  address generate_cipherBlockChaining_decryptVectorAESCrypt() {
    assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from = c_rarg0;  // source array address
    const Register to = c_rarg1;    // destination array address
    const Register key = c_rarg2;   // key array address
    const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
                                    // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r11;      // pick the volatile windows register
#endif

    Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
          Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;

    __ enter();

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg);  // Save
#endif
    __ push(rbx);
    __ vzeroupper();

    // Temporary variable declaration for swapping key bytes
    const XMMRegister xmm_key_shuf_mask = xmm1;
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));

    // Load the expanded key length in ints: 44 for 10 rounds (AES-128), 52 for 12 rounds (AES-192), 60 for 14 rounds (AES-256)
    const Register rounds = rbx;
    __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    const XMMRegister IV = xmm0;
    // Load IV and broadcast value to 512-bits
    __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);

    // Temporary variables for storing round keys
    const XMMRegister RK0 = xmm30;
    const XMMRegister RK1 = xmm9;
    const XMMRegister RK2 = xmm18;
    const XMMRegister RK3 = xmm19;
    const XMMRegister RK4 = xmm20;
    const XMMRegister RK5 = xmm21;
    const XMMRegister RK6 = xmm22;
    const XMMRegister RK7 = xmm23;
    const XMMRegister RK8 = xmm24;
    const XMMRegister RK9 = xmm25;
    const XMMRegister RK10 = xmm26;

    // Load and shuffle key
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 1*16 here and hit 0*16 last
    ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
    ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
    ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
    ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
    ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
    ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
    ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
    ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
    ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
    ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
    ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);

    // Variables for storing source cipher text
    const XMMRegister S0 = xmm10;
    const XMMRegister S1 = xmm11;
    const XMMRegister S2 = xmm12;
    const XMMRegister S3 = xmm13;
    const XMMRegister S4 = xmm14;
    const XMMRegister S5 = xmm15;
    const XMMRegister S6 = xmm16;
    const XMMRegister S7 = xmm17;

    // Variables for storing decrypted text
    const XMMRegister B0 = xmm1;
    const XMMRegister B1 = xmm2;
    const XMMRegister B2 = xmm3;
    const XMMRegister B3 = xmm4;
    const XMMRegister B4 = xmm5;
    const XMMRegister B5 = xmm6;
    const XMMRegister B6 = xmm7;
    const XMMRegister B7 = xmm8;

    __ cmpl(rounds, 44);
    __ jcc(Assembler::greater, KEY_192);
    __ jmp(Loop);

    __ BIND(KEY_192);
    const XMMRegister RK11 = xmm27;
    const XMMRegister RK12 = xmm28;
    ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
    ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);

    __ cmpl(rounds, 52);
    __ jcc(Assembler::greater, KEY_256);
    __ jmp(Loop);

    __ BIND(KEY_256);
    const XMMRegister RK13 = xmm29;
    const XMMRegister RK14 = xmm31;
    ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
    ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);

    __ BIND(Loop);
    __ cmpl(len_reg, 512);
    __ jcc(Assembler::below, Lcbc_dec_rem);
    __ BIND(Loop1);
    __ subl(len_reg, 512);
    __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
    __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
    __ leaq(from, Address(from, 8 * 64));

    __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
    __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
    __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
    __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
    __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
    __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
    __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
    __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);

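    // Build the "previous ciphertext" vectors for the CBC xor below: each
    // valignq concatenates two source registers and shifts right by six
    // qwords, so every destination lane ends up holding the ciphertext
    // block that immediately precedes the block decrypted in the matching
    // B register (IV contributes the block carried in from the previous
    // iteration).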
    __ evalignq(IV, S0, IV, 0x06);
    __ evalignq(S0, S1, S0, 0x06);
    __ evalignq(S1, S2, S1, 0x06);
    __ evalignq(S2, S3, S2, 0x06);
    __ evalignq(S3, S4, S3, 0x06);
    __ evalignq(S4, S5, S4, 0x06);
    __ evalignq(S5, S6, S5, 0x06);
    __ evalignq(S6, S7, S6, 0x06);

    roundDec(RK2);
    roundDec(RK3);
    roundDec(RK4);
    roundDec(RK5);
    roundDec(RK6);
    roundDec(RK7);
    roundDec(RK8);
    roundDec(RK9);
    roundDec(RK10);

    __ cmpl(rounds, 44);
    __ jcc(Assembler::belowEqual, L_128);
    roundDec(RK11);
    roundDec(RK12);

    __ cmpl(rounds, 52);
    __ jcc(Assembler::belowEqual, L_192);
    roundDec(RK13);
    roundDec(RK14);

    __ BIND(L_256);
    roundDeclast(RK0);
    __ jmp(Loop2);

    __ BIND(L_128);
    roundDeclast(RK0);
    __ jmp(Loop2);

    __ BIND(L_192);
    roundDeclast(RK0);

    __ BIND(Loop2);
    __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
    __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
    __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
    __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
    __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
    __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
    __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
    __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
    __ evmovdquq(IV, S7, Assembler::AVX_512bit);

    __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
    __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
    __ leaq(to, Address(to, 8 * 64));
    __ jmp(Loop);

    __ BIND(Lcbc_dec_rem);
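    // Move the last 128-bit lane of IV (the most recently seen ciphertext
    // block) into lane 0, where the 16-byte-at-a-time remainder loop below
    // keeps the running IV.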
4459 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
4460
4461 __ BIND(Lcbc_dec_rem_loop);
4462 __ subl(len_reg, 16);
4463 __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4464
4465 __ movdqu(S0, Address(from, 0));
4466 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4467 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4468 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4469 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4470 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4471 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4472 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4473 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4474 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4475 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4476 __ cmpl(rounds, 44);
4477 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4478
4479 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4480 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4481 __ cmpl(rounds, 52);
4482 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4483
4484 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4485 __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4486
4487 __ BIND(Lcbc_dec_rem_last);
4488 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4489
4490 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4491 __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4492 __ movdqu(Address(to, 0), B0);
4493 __ leaq(from, Address(from, 16));
4494 __ leaq(to, Address(to, 16));
4495 __ jmp(Lcbc_dec_rem_loop);
4496
4497 __ BIND(Lcbc_dec_ret);
4498 __ movdqu(Address(rvec, 0), IV);
4499
4500 // Zero out the round keys
4501 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4502 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4503 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4504 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4505 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4506 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4507 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4508 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4509 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4510 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4511 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4512 __ cmpl(rounds, 44);
4513 __ jcc(Assembler::belowEqual, Lcbc_exit);
4514 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4515 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4516 __ cmpl(rounds, 52);
4517 __ jcc(Assembler::belowEqual, Lcbc_exit);
4518 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4519 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4520
4521 __ BIND(Lcbc_exit);
4522 __ pop(rbx);
4523 #ifdef _WIN64
4524 __ movl(rax, len_mem);
4525 #else
4526 __ pop(rax); // return length
4527 #endif
4528 __ leave(); // required for proper stackwalking of RuntimeStub frame
4529 __ ret(0);
4530 return start;
4531 }
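
  // For reference, the code above follows the usual CBC decrypt recurrence
  // P[i] = AES_decrypt(C[i]) ^ C[i-1] with C[-1] = IV, which is why the
  // previous ciphertext block is carried along in IV. A minimal scalar
  // sketch (aes_decrypt_block stands in for the vaesdec/vaesdeclast round
  // sequence; Block is an assumed 16-byte type):
  /*
  ** void cbc_decrypt_ref(const Block* in, Block* out, int nblocks, Block iv) {
  **   for (int i = 0; i < nblocks; i++) {
  **     Block d = aes_decrypt_block(in[i]); // all rounds with the expanded key
  **     out[i]  = xor_block(d, iv);         // chain with the previous ciphertext
  **     iv      = in[i];                    // ciphertext becomes the next IV
  **   }
  ** }
  */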

  // Polynomial x^128+x^127+x^126+x^121+1
  address ghash_polynomial_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
    address start = __ pc();
    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0xc200000000000000, relocInfo::none);
    return start;
  }

  address ghash_shufflemask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
    address start = __ pc();
    __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
    __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
    return start;
  }

  // GHASH single and multi-block operations using AVX instructions
  address generate_avx_ghash_processBlocks() {
    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    // arguments
    const Register state = c_rarg0;
    const Register htbl = c_rarg1;
    const Register data = c_rarg2;
    const Register blocks = c_rarg3;
    __ enter();
    // Save state before entering routine
    __ avx_ghash(state, htbl, data, blocks);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // byte swap x86 long
  address generate_ghash_long_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
    address start = __ pc();
    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none);
    __ emit_data64(0x0706050403020100, relocInfo::none);
    return start;
  }

  // byte swap x86 byte array
  address generate_ghash_byte_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }

  /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    const Register state = c_rarg0;
    const Register subkeyH = c_rarg1;
    const Register data = c_rarg2;
    const Register blocks = c_rarg3;

    const XMMRegister xmm_temp0 = xmm0;
    const XMMRegister xmm_temp1 = xmm1;
    const XMMRegister xmm_temp2 = xmm2;
    const XMMRegister xmm_temp3 = xmm3;
    const XMMRegister xmm_temp4 = xmm4;
    const XMMRegister xmm_temp5 = xmm5;
    const XMMRegister xmm_temp6 = xmm6;
    const XMMRegister xmm_temp7 = xmm7;
    const XMMRegister xmm_temp8 = xmm8;
    const XMMRegister xmm_temp9 = xmm9;
    const XMMRegister xmm_temp10 = xmm10;

    __ enter();

    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, xmm_temp10);

    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, xmm_temp10);

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key
    //
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp4, xmm_temp0);
    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1

    __ movdqu(xmm_temp5, xmm_temp0);
    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
    __ movdqu(xmm_temp6, xmm_temp0);
    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

    __ pxor(xmm_temp4, xmm_temp5);              // xmm4 holds a0*b1 + a1*b0

    __ movdqu(xmm_temp5, xmm_temp4);            // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);                    // shift xmm4 by 64 bits to the right
    __ pslldq(xmm_temp5, 8);                    // shift xmm5 by 64 bits to the left
    __ pxor(xmm_temp3, xmm_temp5);
    __ pxor(xmm_temp6, xmm_temp4);              // Register pair <xmm6:xmm3> holds the result
                                                // of the carry-less multiplication of
                                                // xmm0 by xmm1.

    // We shift the result of the multiplication by one bit position
    // to the left to account for the fact that the bits are reversed.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp8, xmm_temp6);
    __ pslld(xmm_temp3, 1);
    __ pslld(xmm_temp6, 1);
    __ psrld(xmm_temp7, 31);
    __ psrld(xmm_temp8, 31);
    __ movdqu(xmm_temp9, xmm_temp7);
    __ pslldq(xmm_temp8, 4);
    __ pslldq(xmm_temp7, 4);
    __ psrldq(xmm_temp9, 12);
    __ por(xmm_temp3, xmm_temp7);
    __ por(xmm_temp6, xmm_temp8);
    __ por(xmm_temp6, xmm_temp9);

    //
    // First phase of the reduction
    //
    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
    // independently.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp8, xmm_temp3);
    __ movdqu(xmm_temp9, xmm_temp3);
    __ pslld(xmm_temp7, 31);          // packed left shift, shifting << 31
    __ pslld(xmm_temp8, 30);          // packed left shift, shifting << 30
    __ pslld(xmm_temp9, 25);          // packed left shift, shifting << 25
    __ pxor(xmm_temp7, xmm_temp8);    // xor the shifted versions
    __ pxor(xmm_temp7, xmm_temp9);
    __ movdqu(xmm_temp8, xmm_temp7);
    __ pslldq(xmm_temp7, 12);
    __ psrldq(xmm_temp8, 4);
    __ pxor(xmm_temp3, xmm_temp7);    // first phase of the reduction complete

    //
    // Second phase of the reduction
    //
    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
    // shift operations.
    __ movdqu(xmm_temp2, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);           // packed right shift, shifting >> 1
    __ psrld(xmm_temp4, 2);           // packed right shift, shifting >> 2
    __ psrld(xmm_temp5, 7);           // packed right shift, shifting >> 7
    __ pxor(xmm_temp2, xmm_temp4);    // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp8);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);    // the result is in xmm6

    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
    __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
    __ leave();
    __ ret(0);
    return start;
  }
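
  // For reference, each iteration of L_ghash_loop above computes
  //   state = (state ^ data_block) * subkeyH   in GF(2^128),
  // reduced modulo the polynomial named for ghash_polynomial_addr(). A
  // minimal bitwise sketch of that multiply, following the NIST SP 800-38D
  // formulation (unsigned __int128 is a compiler extension used here only
  // for brevity; the pclmulqdq sequence above computes the same product):
  /*
  ** typedef unsigned __int128 u128;
  ** u128 gf128_mul(u128 x, u128 y) {
  **   const u128 R = (u128)0xE1 << 120; // reduction constant for the polynomial
  **   u128 z = 0, v = x;
  **   for (int i = 127; i >= 0; i--) {
  **     if ((y >> i) & 1) z ^= v;              // conditional add is xor in GF(2)
  **     v = (v & 1) ? (v >> 1) ^ R : (v >> 1); // multiply v by x and reduce
  **   }
  **   return z;
  ** }
  */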

  // base64 character set
  address base64_charset_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "base64_charset");
    address start = __ pc();
    __ emit_data64(0x0000004200000041, relocInfo::none);
    __ emit_data64(0x0000004400000043, relocInfo::none);
    __ emit_data64(0x0000004600000045, relocInfo::none);
    __ emit_data64(0x0000004800000047, relocInfo::none);
    __ emit_data64(0x0000004a00000049, relocInfo::none);
    __ emit_data64(0x0000004c0000004b, relocInfo::none);
    __ emit_data64(0x0000004e0000004d, relocInfo::none);
    __ emit_data64(0x000000500000004f, relocInfo::none);
    __ emit_data64(0x0000005200000051, relocInfo::none);
    __ emit_data64(0x0000005400000053, relocInfo::none);
    __ emit_data64(0x0000005600000055, relocInfo::none);
    __ emit_data64(0x0000005800000057, relocInfo::none);
    __ emit_data64(0x0000005a00000059, relocInfo::none);
    __ emit_data64(0x0000006200000061, relocInfo::none);
    __ emit_data64(0x0000006400000063, relocInfo::none);
    __ emit_data64(0x0000006600000065, relocInfo::none);
    __ emit_data64(0x0000006800000067, relocInfo::none);
    __ emit_data64(0x0000006a00000069, relocInfo::none);
    __ emit_data64(0x0000006c0000006b, relocInfo::none);
    __ emit_data64(0x0000006e0000006d, relocInfo::none);
    __ emit_data64(0x000000700000006f, relocInfo::none);
    __ emit_data64(0x0000007200000071, relocInfo::none);
    __ emit_data64(0x0000007400000073, relocInfo::none);
    __ emit_data64(0x0000007600000075, relocInfo::none);
    __ emit_data64(0x0000007800000077, relocInfo::none);
    __ emit_data64(0x0000007a00000079, relocInfo::none);
    __ emit_data64(0x0000003100000030, relocInfo::none);
    __ emit_data64(0x0000003300000032, relocInfo::none);
    __ emit_data64(0x0000003500000034, relocInfo::none);
    __ emit_data64(0x0000003700000036, relocInfo::none);
    __ emit_data64(0x0000003900000038, relocInfo::none);
    __ emit_data64(0x0000002f0000002b, relocInfo::none);
    return start;
  }
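
  // Layout note: the table above keeps one encoding character per 32-bit
  // slot, zero-extended (e.g. the first quadword 0x0000004200000041 packs
  // 'A' = 0x41 and 'B' = 0x42), so it can be indexed both by the dword
  // gathers and by the scalar byte loads used below. As a sketch:
  /*
  ** // table points at the data emitted above
  ** char base64_lookup(const int* table, int sixbits) {
  **   return (char)table[sixbits]; // low byte of the dword entry
  ** }
  */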

  // base64 URL character set
  address base64url_charset_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "base64url_charset");
    address start = __ pc();
    __ emit_data64(0x0000004200000041, relocInfo::none);
    __ emit_data64(0x0000004400000043, relocInfo::none);
    __ emit_data64(0x0000004600000045, relocInfo::none);
    __ emit_data64(0x0000004800000047, relocInfo::none);
    __ emit_data64(0x0000004a00000049, relocInfo::none);
    __ emit_data64(0x0000004c0000004b, relocInfo::none);
    __ emit_data64(0x0000004e0000004d, relocInfo::none);
    __ emit_data64(0x000000500000004f, relocInfo::none);
    __ emit_data64(0x0000005200000051, relocInfo::none);
    __ emit_data64(0x0000005400000053, relocInfo::none);
    __ emit_data64(0x0000005600000055, relocInfo::none);
    __ emit_data64(0x0000005800000057, relocInfo::none);
    __ emit_data64(0x0000005a00000059, relocInfo::none);
    __ emit_data64(0x0000006200000061, relocInfo::none);
    __ emit_data64(0x0000006400000063, relocInfo::none);
    __ emit_data64(0x0000006600000065, relocInfo::none);
    __ emit_data64(0x0000006800000067, relocInfo::none);
    __ emit_data64(0x0000006a00000069, relocInfo::none);
    __ emit_data64(0x0000006c0000006b, relocInfo::none);
    __ emit_data64(0x0000006e0000006d, relocInfo::none);
    __ emit_data64(0x000000700000006f, relocInfo::none);
    __ emit_data64(0x0000007200000071, relocInfo::none);
    __ emit_data64(0x0000007400000073, relocInfo::none);
    __ emit_data64(0x0000007600000075, relocInfo::none);
    __ emit_data64(0x0000007800000077, relocInfo::none);
    __ emit_data64(0x0000007a00000079, relocInfo::none);
    __ emit_data64(0x0000003100000030, relocInfo::none);
    __ emit_data64(0x0000003300000032, relocInfo::none);
    __ emit_data64(0x0000003500000034, relocInfo::none);
    __ emit_data64(0x0000003700000036, relocInfo::none);
    __ emit_data64(0x0000003900000038, relocInfo::none);
    __ emit_data64(0x0000005f0000002d, relocInfo::none);

    return start;
  }

  address base64_bswap_mask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
    address start = __ pc();
    __ emit_data64(0x0504038002010080, relocInfo::none);
    __ emit_data64(0x0b0a098008070680, relocInfo::none);
    __ emit_data64(0x0908078006050480, relocInfo::none);
    __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
    __ emit_data64(0x0605048003020180, relocInfo::none);
    __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
    __ emit_data64(0x0504038002010080, relocInfo::none);
    __ emit_data64(0x0b0a098008070680, relocInfo::none);

    return start;
  }

  address base64_right_shift_mask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
    address start = __ pc();
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);
    __ emit_data64(0x0006000400020000, relocInfo::none);

    return start;
  }

  address base64_left_shift_mask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
    address start = __ pc();
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);
    __ emit_data64(0x0000000200040000, relocInfo::none);

    return start;
  }

  address base64_and_mask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "and_mask");
    address start = __ pc();
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    __ emit_data64(0x3f003f003f000000, relocInfo::none);
    return start;
  }

  address base64_gather_mask_addr() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "gather_mask");
    address start = __ pc();
    __ emit_data64(0xffffffffffffffff, relocInfo::none);
    return start;
  }

  // Code for generating Base64 encoding.
  // Intrinsic function prototype in Base64.java:
  // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
  address generate_base64_encodeBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "implEncode");
    address start = __ pc();
    __ enter();

    // Save callee-saved registers before using them
    __ push(r12);
    __ push(r13);
    __ push(r14);
    __ push(r15);

    // arguments
    const Register source = c_rarg0;       // Source Array
    const Register start_offset = c_rarg1; // start offset
    const Register end_offset = c_rarg2;   // end offset
    const Register dest = c_rarg3;         // destination array

#ifndef _WIN64
    const Register dp = c_rarg4;    // Position for writing to dest array
    const Register isURL = c_rarg5; // Base64 or URL character set
#else
    const Address dp_mem(rbp, 6 * wordSize);    // dp and isURL are on the stack on Win64
    const Address isURL_mem(rbp, 7 * wordSize);
    const Register isURL = r10; // pick a volatile Windows register
    const Register dp = r12;
    __ movl(dp, dp_mem);
    __ movl(isURL, isURL_mem);
#endif

    const Register length = r14;
    Label L_process80, L_process32, L_process3, L_exit, L_processdata;

    // calculate length from offsets
    __ movl(length, end_offset);
    __ subl(length, start_offset);
    __ cmpl(length, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
    // check if the base64 charset (isURL = 0) or the base64 URL charset (isURL = 1) needs to be loaded
    __ cmpl(isURL, 0);
    __ jcc(Assembler::equal, L_processdata);
    __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));

    // load masks required for encoding data
    __ BIND(L_processdata);
    __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
    // Set 64 bits of K register.
    __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
    __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
    __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
    __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
    __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);

    // Vector Base64 implementation, producing 96 bytes of encoded data
    __ BIND(L_process80);
    __ cmpl(length, 80);
    __ jcc(Assembler::below, L_process32);
    __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
    __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
    __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);

    // permute the input data in such a manner that we have continuity of the source
    __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
    __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
    __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);

    // shuffle the input to group 3 bytes of data and add 0 as the 4th byte;
    // we can deal with 12 bytes at a time in a 128-bit register
    __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
    __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
    __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);

    // convert byte to word; each 128-bit register will have 6 bytes for processing
    __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
    __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
    __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);

    // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit numbers into four 6-bit numbers
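    // Worked example (sketch): for the bytes "Man" = 0x4d 0x61 0x6e, the
    // 24 bits 010011 010110 000101 101110 split into the 6-bit indices
    // 19, 22, 5, 46, which encode to "TWFu". The variable right shifts
    // (xmm13) move the high bits of each 6-bit group into place, the
    // variable left shifts (xmm14) position the low bits, and the results
    // are masked (xmm15) and OR-ed together below.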
    __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit);
    __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
    __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);

    __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
    __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
    __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);

    __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
    __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
    __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);

    __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
    __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
    __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);

    __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
    __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
    __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);

    // Get the final 4*6 bits base64 encoding
    __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
    __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
    __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);

    // Shift
    __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
    __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
    __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);

    // look up 6 bits in the base64 character set to fetch the encoding;
    // we are converting word to dword as gather instructions need dword indices for looking up the encoding
    __ vextracti64x4(xmm6, xmm3, 0);
    __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
    __ vextracti64x4(xmm6, xmm3, 1);
    __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);

    __ vextracti64x4(xmm6, xmm4, 0);
    __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
    __ vextracti64x4(xmm6, xmm4, 1);
    __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);

    __ vextracti64x4(xmm4, xmm5, 0);
    __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);

    __ vextracti64x4(xmm4, xmm5, 1);
    __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);

    __ kmovql(k2, k3);
    __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);

    // Down-convert dword to byte. Final output is 16*6 = 96 bytes long
    __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);

    __ addq(dest, 96);
    __ addq(source, 72);
    __ subq(length, 72);
    __ jmp(L_process80);

    // Vector Base64 implementation generating 32 bytes of encoded data
    __ BIND(L_process32);
    __ cmpl(length, 32);
    __ jcc(Assembler::below, L_process3);
    __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
    __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
    __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
    __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
    __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
    __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);

    __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
    __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
    __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
    __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
    __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
    __ vextracti64x4(xmm9, xmm1, 0);
    __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
    __ vextracti64x4(xmm9, xmm1, 1);
    __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
    __ kmovql(k2, k3);
    __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
    __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
    __ subq(length, 24);
    __ addq(dest, 32);
    __ addq(source, 24);
    __ jmp(L_process32);

    // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
    /* This code corresponds to the scalar version of the following snippet in Base64.java
    ** int bits = (src[sp0++] & 0xff) << 16 | (src[sp0++] & 0xff) << 8 | (src[sp0++] & 0xff);
    ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
    ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
    ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
    ** dst[dp0++] = (byte)base64[bits & 0x3f]; */
    __ BIND(L_process3);
    __ cmpl(length, 3);
    __ jcc(Assembler::below, L_exit);
    // Read 1 byte at a time
    __ movzbl(rax, Address(source, start_offset));
    __ shll(rax, 0x10);
    __ movl(r15, rax);
    __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
    __ shll(rax, 0x8);
    __ movzwl(rax, rax);
    __ orl(r15, rax);
    __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
    __ orl(rax, r15);
    // Save 3 bytes read in r15
    __ movl(r15, rax);
    __ shrl(rax, 0x12);
    __ andl(rax, 0x3f);
    // rax contains the index, r11 contains the base64 lookup table
    __ movb(rax, Address(r11, rax, Address::times_4));
    // Write the encoded byte to the destination
    __ movb(Address(dest, dp, Address::times_1, 0), rax);
    __ movl(rax, r15);
    __ shrl(rax, 0xc);
    __ andl(rax, 0x3f);
    __ movb(rax, Address(r11, rax, Address::times_4));
    __ movb(Address(dest, dp, Address::times_1, 1), rax);
    __ movl(rax, r15);
    __ shrl(rax, 0x6);
    __ andl(rax, 0x3f);
    __ movb(rax, Address(r11, rax, Address::times_4));
    __ movb(Address(dest, dp, Address::times_1, 2), rax);
    __ movl(rax, r15);
    __ andl(rax, 0x3f);
    __ movb(rax, Address(r11, rax, Address::times_4));
    __ movb(Address(dest, dp, Address::times_1, 3), rax);
    __ subl(length, 3);
    __ addq(dest, 4);
    __ addq(source, 3);
    __ jmp(L_process3);

    __ BIND(L_exit);
    __ pop(r15);
    __ pop(r14);
    __ pop(r13);
    __ pop(r12);
    __ leave();
    __ ret(0);
    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   rax       - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    // rscratch1: r10
    const Register crc   = c_rarg0; // crc
    const Register buf   = c_rarg1; // source java byte array address
    const Register len   = c_rarg2; // length
    const Register table = c_rarg3; // crc_table address (reuse register)
    const Register tmp   = r11;
    assert_different_registers(crc, buf, len, table, tmp, rax);

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len, table, tmp);

    __ movl(rax, crc);
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
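
  // For reference, this computes the standard CRC-32 of java.util.zip.CRC32
  // (reflected polynomial 0xEDB88320); kernel_crc32 folds the data with
  // PCLMULQDQ/AVX, but is equivalent to this bitwise sketch:
  /*
  ** uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
  **   crc = ~crc;
  **   while (len--) {
  **     crc ^= *buf++;
  **     for (int k = 0; k < 8; k++)
  **       crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  **   }
  **   return ~crc;
  ** }
  */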

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - long length
   *   c_rarg3   - table_start - optional (present only when doing a library_call,
   *               not used by x86 algorithm)
   *
   * Output:
   *   rax       - int crc result
   */
  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
    assert(UseCRC32CIntrinsics, "need SSE4_2");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
    address start = __ pc();
    // reg.arg   int#0   int#1   int#2   int#3   int#4   int#5   float regs
    // Windows   RCX     RDX     R8      R9      none    none    XMM0..XMM3
    // Lin / Sol RDI     RSI     RDX     RCX     R8      R9      XMM0..XMM7
    const Register crc = c_rarg0; // crc
    const Register buf = c_rarg1; // source java byte array address
    const Register len = c_rarg2; // length
    const Register a = rax;
    const Register j = r9;
    const Register k = r10;
    const Register l = r11;
#ifdef _WIN64
    const Register y = rdi;
    const Register z = rsi;
#else
    const Register y = rcx;
    const Register z = r8;
#endif
    assert_different_registers(crc, buf, len, a, j, k, l, y, z);

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
    __ push(y);
    __ push(z);
#endif
    __ crc32c_ipl_alg2_alt2(crc, buf, len,
                            a, j, k,
                            l, y, z,
                            c_farg0, c_farg1, c_farg2,
                            is_pclmulqdq_supported);
    __ movl(rax, crc);
#ifdef _WIN64
    __ pop(z);
    __ pop(y);
#endif
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
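
  // CRC-32C differs from the CRC-32 above only in its generator polynomial
  // (Castagnoli; 0x82F63B78 in reflected form, the polynomial the SSE4.2
  // CRC32 instruction implements directly): the bitwise sketch after
  // generate_updateBytesCRC32() applies verbatim with that constant
  // substituted.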

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - y address
   *   c_rarg3   - y length
   * not Win64
   *   c_rarg4   - z address
   *   c_rarg5   - z length
   * Win64
   *   rsp+40    - z address
   *   rsp+48    - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register x    = rdi;
    const Register xlen = rax;
    const Register y    = rsi;
    const Register ylen = rcx;
    const Register z    = r8;
    const Register zlen = r11;

    // Next registers will be saved on stack in multiply_to_len().
    const Register tmp1 = r12;
    const Register tmp2 = r13;
    const Register tmp3 = r14;
    const Register tmp4 = r15;
    const Register tmp5 = rbx;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifndef _WIN64
    __ movptr(zlen, r9); // Save r9 in r11 - zlen
#endif
    setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
                       // ylen => rcx, z => r8, zlen => r11
                       // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
    // last 2 arguments (#4, #5) are on stack on Win64
    __ movptr(z, Address(rsp, 6 * wordSize));
    __ movptr(zlen, Address(rsp, 7 * wordSize));
#endif

    __ movptr(xlen, rsi);
    __ movptr(y, rdx);
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);

    restore_arg_regs();

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
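
  // For reference, multiply_to_len implements the grade-school product of
  // BigInteger magnitudes (32-bit limbs, most significant limb first). A
  // minimal scalar sketch of the same computation:
  /*
  ** void multiply_to_len_ref(const uint32_t* x, int xlen,
  **                          const uint32_t* y, int ylen, uint32_t* z) {
  **   memset(z, 0, (size_t)(xlen + ylen) * sizeof(uint32_t));
  **   for (int i = xlen - 1; i >= 0; i--) {
  **     uint64_t carry = 0;
  **     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  **       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
  **       z[k] = (uint32_t)p; // low 32 bits stay in this column
  **       carry = p >> 32;    // high 32 bits carry into the next column
  **     }
  **     z[i] = (uint32_t)carry;
  **   }
  ** }
  */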

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - obja     address
   *   c_rarg1   - objb     address
   *   c_rarg2   - length   length
   *   c_rarg3   - scale    log2_array_indxscale
   *
   * Output:
   *   rax       - int, >= 0: mismatched index, < 0: bitwise complement of tail
   */
  address generate_vectorizedMismatch() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter();

#ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register scale = c_rarg0;  // rcx, will exchange with r9
    const Register objb = c_rarg1;   // rdx
    const Register length = c_rarg2; // r8
    const Register obja = c_rarg3;   // r9
    __ xchgq(obja, scale); // now obja and scale contain the correct contents

    const Register tmp1 = r10;
    const Register tmp2 = r11;
#endif
#ifndef _WIN64  // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register obja = c_rarg0;   // U:rdi
    const Register objb = c_rarg1;   // U:rsi
    const Register length = c_rarg2; // U:rdx
    const Register scale = c_rarg3;  // U:rcx
    const Register tmp1 = r8;
    const Register tmp2 = r9;
#endif
    const Register result = rax; // return value
    const XMMRegister vec0 = xmm0;
    const XMMRegister vec1 = xmm1;
    const XMMRegister vec2 = xmm2;

    __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);

    __ vzeroupper();
    __ leave();
    __ ret(0);

    return start;
  }
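
  // A simplified scalar model of the contract (the real stub compares in
  // wide chunks and may stop before the end, reporting the unchecked tail
  // as a bitwise complement; this sketch checks everything, so its "tail"
  // is always empty):
  /*
  ** int vectorized_mismatch_ref(const uint8_t* obja, const uint8_t* objb,
  **                             int length, int log2scale) {
  **   size_t nbytes = (size_t)length << log2scale;
  **   for (size_t i = 0; i < nbytes; i++)
  **     if (obja[i] != objb[i]) return (int)(i >> log2scale); // element index
  **   return ~0; // no mismatch: complement of the (empty) remaining tail
  ** }
  */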

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - z address
   *   c_rarg3   - z length
   */
  address generate_squareToLen() {

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");

    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
    const Register x    = rdi;
    const Register len  = rsi;
    const Register z    = r8;
    const Register zlen = rcx;

    const Register tmp1 = r12;
    const Register tmp2 = r13;
    const Register tmp3 = r14;
    const Register tmp4 = r15;
    const Register tmp5 = rbx;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
                       // zlen => rcx
                       // r9 and r10 may be used to save non-volatile registers
    __ movptr(r8, rdx);
    __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);

    restore_arg_regs();

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
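
  // squareToLen is the special case z = x * x of multiplyToLen;
  // square_to_len exploits the symmetry x[i]*x[j] == x[j]*x[i] to skip
  // roughly half the partial products, but is semantically equivalent to
  // this sketch (in terms of the reference routine above):
  /*
  ** void square_to_len_ref(const uint32_t* x, int len, uint32_t* z) {
  **   multiply_to_len_ref(x, len, x, len, z); // z receives 2 * len limbs
  ** }
  */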

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing

    BLOCK_COMMENT("Entry:");
    __ enter(); // save rbp

    // save c_rarg0, because we want to use that value.
    // We could do without it but then we depend on the number of slots used by pusha
    __ push(c_rarg0);

    __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address

    __ pusha();

    // The method may have floats as arguments, and we must spill them before calling
    // the VM runtime.
    assert(Argument::n_float_register_parameters_j == 8, "Assumption");
    const int xmm_size = wordSize * 2;
    const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
    __ subptr(rsp, xmm_spill_size);
    __ movdqu(Address(rsp, xmm_size * 7), xmm7);
    __ movdqu(Address(rsp, xmm_size * 6), xmm6);
    __ movdqu(Address(rsp, xmm_size * 5), xmm5);
    __ movdqu(Address(rsp, xmm_size * 4), xmm4);
    __ movdqu(Address(rsp, xmm_size * 3), xmm3);
    __ movdqu(Address(rsp, xmm_size * 2), xmm2);
    __ movdqu(Address(rsp, xmm_size * 1), xmm1);
    __ movdqu(Address(rsp, xmm_size * 0), xmm0);

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);

    __ movdqu(xmm0, Address(rsp, xmm_size * 0));
    __ movdqu(xmm1, Address(rsp, xmm_size * 1));
    __ movdqu(xmm2, Address(rsp, xmm_size * 2));
    __ movdqu(xmm3, Address(rsp, xmm_size * 3));
    __ movdqu(xmm4, Address(rsp, xmm_size * 4));
    __ movdqu(xmm5, Address(rsp, xmm_size * 5));
    __ movdqu(xmm6, Address(rsp, xmm_size * 6));
    __ movdqu(xmm7, Address(rsp, xmm_size * 7));
    __ addptr(rsp, xmm_spill_size);

    __ cmpl(rax, 1); // 1 means deoptimize
    __ jcc(Assembler::equal, deoptimize_label);

    __ popa();
    __ pop(c_rarg0);

    __ leave();

    __ addptr(rsp, 1 * wordSize); // cookie
    __ ret(0);

    __ BIND(deoptimize_label);

    __ popa();
    __ pop(c_rarg0);

    __ leave();

    // this can be taken out, but is good for verification purposes. getting a SIGSEGV
    // here while still having a correct stack is valuable
    __ testptr(rsp, Address(rsp, 0));

    __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
    __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point

    return start;
  }
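
  // Control flow of the barrier, as a sketch (the runtime call decides
  // whether the nmethod is still valid to enter; the names here are
  // descriptive, not the runtime's):
  /*
  ** if (BarrierSetNMethod::nmethod_stub_entry_barrier(&return_address) == 1) {
  **   // deoptimize: the runtime stored a replacement rsp where the cookie
  **   // was pushed; unwind to it and re-enter at the caller's verified entry
  ** } else {
  **   // fall through into the method body with all registers restored
  ** }
  */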

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - out address
   *   c_rarg1   - in address
   *   c_rarg2   - offset
   *   c_rarg3   - len
   * not Win64
   *   c_rarg4   - k
   * Win64
   *   rsp+40    - k
   */
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register out    = rdi;
    const Register in     = rsi;
    const Register offset = r11;
    const Register len    = rcx;
    const Register k      = r8;

    // Next registers will be saved on stack in mul_add().
    const Register tmp1 = r12;
    const Register tmp2 = r13;
    const Register tmp3 = r14;
    const Register tmp4 = r15;
    const Register tmp5 = rbx;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
                       // len => rcx, k => r8
                       // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
    // last argument is on stack on Win64
    __ movl(k, Address(rsp, 6 * wordSize));
#endif
    __ movptr(r11, rdx); // move offset in rdx to offset(r11)
    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);

    restore_arg_regs();

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
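
  // For reference, mul_add has the semantics of BigInteger's mulAdd:
  // multiply in[0..len) by the 32-bit value k and add the product into a
  // window of out[], returning the final carry. A minimal scalar sketch
  // (offset indexes the least significant limb of the destination window;
  // limbs are stored most significant first):
  /*
  ** uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
  **                      int offset, int len, uint32_t k) {
  **   uint64_t carry = 0;
  **   for (int j = len - 1; j >= 0; j--) {
  **     uint64_t p = (uint64_t)in[j] * k + out[offset] + carry;
  **     out[offset--] = (uint32_t)p;
  **     carry = p >> 32;
  **   }
  **   return (uint32_t)carry;
  ** }
  */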

  address generate_libmExp() {
    StubCodeMark mark(this, "StubRoutines", "libmExp");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmLog() {
    StubCodeMark mark(this, "StubRoutines", "libmLog");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r11;
    const Register tmp2 = r8;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmLog10() {
    StubCodeMark mark(this, "StubRoutines", "libmLog10");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmPow() {
    StubCodeMark mark(this, "StubRoutines", "libmPow");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmSin() {
    StubCodeMark mark(this, "StubRoutines", "libmSin");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmCos() {
    StubCodeMark mark(this, "StubRoutines", "libmCos");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmTan() {
    StubCodeMark mark(this, "StubRoutines", "libmTan");

    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs. If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    enum layout {
      rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
      rbp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // return address and rbp are already in place
    __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(rsp, rbp, the_pc);
    __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ movptr(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ movptr(c_rarg2, arg2);
    }
    __ movptr(c_rarg0, r15_thread);
    BLOCK_COMMENT("call runtime_entry");
    __ call(RuntimeAddress(runtime_entry));

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);

    __ leave(); // required for proper stackwalking of RuntimeStub frame

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
              (int32_t) NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  void create_control_words() {
    // Round to nearest, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
    // Round to nearest, 24-bit mode, exceptions masked
    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
    // Round to nearest, 64-bit mode, exceptions masked
    StubRoutines::_mxcsr_std           = 0x1F80;
    // Note: the following two constants are 80-bit values;
    // the layout is critical for correct loading by the FPU.
    // Bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias1[0] = 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias1[1] = 0x80000000;
    StubRoutines::_fpu_subnormal_bias1[2] = 0x03ff;
    // Un-bias for strict fp multiply/divide
    StubRoutines::_fpu_subnormal_bias2[0] = 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias2[1] = 0x80000000;
    StubRoutines::_fpu_subnormal_bias2[2] = 0x7bff;
  }
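
  // Decoding of _mxcsr_std == 0x1F80, as a sketch: bits 7..12 are the six
  // SSE exception masks (all set), bits 13..14 select round-to-nearest,
  // and FTZ (bit 15) / DAZ (bit 6) are off, matching the x87 words above.
  /*
  ** const uint32_t mxcsr = 0x1F80;
  ** static_assert(((mxcsr >> 7) & 0x3F) == 0x3F, "IM,DM,ZM,OM,UM,PM masked");
  ** static_assert(((mxcsr >> 13) & 0x3) == 0,    "RC = round to nearest");
  ** static_assert((mxcsr & ((1u << 15) | (1u << 6))) == 0, "FTZ and DAZ off");
  */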
5878
5879 // Initialization
generate_initial()5880 void generate_initial() {
5881 // Generates all stubs and initializes the entry points
5882
5883 // This platform-specific settings are needed by generate_call_stub()
5884 create_control_words();
5885
5886 // entry points that exist in all platforms Note: This is code
5887 // that could be shared among different platforms - however the
5888 // benefit seems to be smaller than the disadvantage of having a
5889 // much more complicated generator structure. See also comment in
5890 // stubRoutines.hpp.
5891
5892 StubRoutines::_forward_exception_entry = generate_forward_exception();
5893
5894 StubRoutines::_call_stub_entry =
5895 generate_call_stub(StubRoutines::_call_stub_return_address);
5896
5897 // is referenced by megamorphic call
5898 StubRoutines::_catch_exception_entry = generate_catch_exception();
5899
5900 // atomic calls
5901 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
5902 StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long();
5903 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
5904 StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte();
5905 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
5906 StubRoutines::_atomic_add_entry = generate_atomic_add();
5907 StubRoutines::_atomic_add_long_entry = generate_atomic_add_long();
5908 StubRoutines::_fence_entry = generate_orderaccess_fence();
5909
5910 // platform dependent
5911 StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5912 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5913
5914 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
5915
5916 // Build this early so it's available for the interpreter.
5917 StubRoutines::_throw_StackOverflowError_entry =
5918 generate_throw_exception("StackOverflowError throw_exception",
5919 CAST_FROM_FN_PTR(address,
5920 SharedRuntime::
5921 throw_StackOverflowError));
5922 StubRoutines::_throw_delayed_StackOverflowError_entry =
5923 generate_throw_exception("delayed StackOverflowError throw_exception",
5924 CAST_FROM_FN_PTR(address,
5925 SharedRuntime::
5926 throw_delayed_StackOverflowError));
5927 if (UseCRC32Intrinsics) {
5928 // set table address before stub generation which use it
5929 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5930 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5931 }
5932
5933 if (UseCRC32CIntrinsics) {
5934 bool supports_clmul = VM_Version::supports_clmul();
5935 StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5936 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5937 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5938 }
5939 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5940 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5941 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5942 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
        StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
        StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
        StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
        StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
        StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
        StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
        StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
        StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
        StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
        StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
        StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
        StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
        StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
        StubRoutines::_dexp = generate_libmExp();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
        StubRoutines::_dlog = generate_libmLog();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
        StubRoutines::_dlog10 = generate_libmLog10();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
        StubRoutines::_dpow = generate_libmPow();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
        StubRoutines::_dsin = generate_libmSin();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dcos = generate_libmCos();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dtan = generate_libmTan();
      }
    }
  }

  void generate_all() {
    // Generates all stubs and initializes the entry points

    // These entry points require SharedInfo::stack0 to be set up in
    // non-core builds and need to be relocatable, so they each
    // fabricate a RuntimeStub internally.
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // entry points that are platform specific
    StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
    StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
    StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
    StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();

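    // Scalar and vector bit-mask constants used by compiled code, e.g.
    // float/double abs is an AND with the sign mask and negate is an XOR
    // with the sign flip.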
    StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
    StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
    StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
    StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
    StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
      } else {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
      }
    }
    if (UseAESCTRIntrinsics) {
      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
    }

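    // SHA-1/SHA-256 operate on big-endian 32-bit words; the "byte flip"
    // shuffle masks generated below are (as I read it) PSHUFB constants used
    // to byte-reverse little-endian input into that word order.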
    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
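      // Widen the k256 round-constant table into 32-byte entries: each group
      // of four constants is duplicated into both 128-bit halves, so a 256-bit
      // (AVX2) load sees the same constants in both lanes.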
      char* dst = (char*)StubRoutines::x86::_k256_W;
      char* src = (char*)StubRoutines::x86::_k256;
      for (int ii = 0; ii < 16; ++ii) {
        memcpy(dst + 32 * ii,      src + 16 * ii, 16);
        memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
      }
      StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      if (VM_Version::supports_avx()) {
        StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
        StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
        StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
      } else {
        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      }
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::x86::_and_mask = base64_and_mask_addr();
      StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
      StubRoutines::x86::_base64_charset = base64_charset_addr();
      StubRoutines::x86::_url_charset = base64url_charset_addr();
      StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
      StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
      StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    // Safefetch stubs.
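    // SafeFetch32/SafeFetchN load a word from an address that may be unmapped.
    // If the load faults, the signal handler redirects execution to the
    // recorded continuation PC, which returns the caller-supplied default,
    // e.g. (sketch):  int v = SafeFetch32((int*)p, -1);  // -1 if p faults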
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

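    // The nmethod entry barrier stub is only needed when the selected GC
    // installs a BarrierSetNMethod (e.g. ZGC, which unloads classes
    // concurrently and must check compiled method entries).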
    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
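    // The Montgomery "stubs" are not generated assembly; they point directly
    // at C++ entry points in SharedRuntime.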
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif // COMPILER2

    if (UseVectorizedMismatchIntrinsic) {
      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
    }
  }

 public:
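  // Invoked twice during bootstrap: first with all == false for the stubs
  // needed early in VM startup, then with all == true for the remainder.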
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 16
void StubGenerator_generate(CodeBuffer* code, bool all) {
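  // The UnsafeCopyMemory table records code ranges of copy stubs that may
  // fault on Unsafe memory accesses, so the fault handler can resume at the
  // matching error exit instead of crashing the VM.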
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}