1 /*
2 * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "gc/shared/barrierSet.hpp"
29 #include "gc/shared/barrierSetAssembler.hpp"
30 #include "gc/shared/barrierSetNMethod.hpp"
31 #include "interpreter/interpreter.hpp"
32 #include "memory/universe.hpp"
33 #include "nativeInst_x86.hpp"
34 #include "oops/instanceOop.hpp"
35 #include "oops/method.hpp"
36 #include "oops/objArrayKlass.hpp"
37 #include "oops/oop.inline.hpp"
38 #include "prims/methodHandles.hpp"
39 #include "runtime/frame.inline.hpp"
40 #include "runtime/handles.inline.hpp"
41 #include "runtime/sharedRuntime.hpp"
42 #include "runtime/stubCodeGenerator.hpp"
43 #include "runtime/stubRoutines.hpp"
44 #include "runtime/thread.inline.hpp"
45 #ifdef COMPILER2
46 #include "opto/runtime.hpp"
47 #endif
48
49 // Declaration and definition of StubGenerator (no .hpp file).
50 // For a more detailed description of the stub routine structure
51 // see the comment in stubRoutines.hpp
52
53 #define __ _masm->
54 #define a__ ((Assembler*)_masm)->
55
56 #ifdef PRODUCT
57 #define BLOCK_COMMENT(str) /* nothing */
58 #else
59 #define BLOCK_COMMENT(str) __ block_comment(str)
60 #endif
61
62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
63
64 const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
65 const int FPU_CNTRL_WRD_MASK = 0xFFFF;
66
67 // -------------------------------------------------------------------------------------------------------------------------
68 // Stub Code definitions
69
70 class StubGenerator: public StubCodeGenerator {
71 private:
72
73 #ifdef PRODUCT
74 #define inc_counter_np(counter) ((void)0)
75 #else
76 void inc_counter_np_(int& counter) {
77 __ incrementl(ExternalAddress((address)&counter));
78 }
79 #define inc_counter_np(counter) \
80 BLOCK_COMMENT("inc_counter " #counter); \
81 inc_counter_np_(counter);
82 #endif //PRODUCT
83
inc_copy_counter_np(BasicType t)84 void inc_copy_counter_np(BasicType t) {
85 #ifndef PRODUCT
86 switch (t) {
87 case T_BYTE: inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
88 case T_SHORT: inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
89 case T_INT: inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
90 case T_LONG: inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
91 case T_OBJECT: inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
92 default: ShouldNotReachHere();
93 }
94 #endif //PRODUCT
95 }
96
97 //------------------------------------------------------------------------------------------------------------------------
// Call stubs are used to call Java from C.
//
// Frame layout while the stub is active (slot numbers are word offsets
// relative to rbp):
//
//    [ return_from_Java     ] <--- rsp
//    [ argument word n      ]
//      ...
// -N [ argument word 1      ]
// -7 [ Possible padding for stack alignment ]
// -6 [ Possible padding for stack alignment ]
// -5 [ Possible padding for stack alignment ]
// -4 [ mxcsr save           ] <--- rsp_after_call
// -3 [ saved rbx            ]
// -2 [ saved rsi            ]
// -1 [ saved rdi            ]
//  0 [ saved rbp            ] <--- rbp
//  1 [ return address       ]
//  2 [ ptr. to call wrapper ]
//  3 [ result               ]
//  4 [ result_type          ]
//  5 [ method               ]
//  6 [ entry_point          ]
//  7 [ parameters           ]
//  8 [ parameter_size       ]
//  9 [ thread               ]
121
122
generate_call_stub(address & return_address)123 address generate_call_stub(address& return_address) {
124 StubCodeMark mark(this, "StubRoutines", "call_stub");
125 address start = __ pc();
126
127 // stub code parameters / addresses
128 assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
129 bool sse_save = false;
130 const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
131 const int locals_count_in_bytes (4*wordSize);
132 const Address mxcsr_save (rbp, -4 * wordSize);
133 const Address saved_rbx (rbp, -3 * wordSize);
134 const Address saved_rsi (rbp, -2 * wordSize);
135 const Address saved_rdi (rbp, -1 * wordSize);
136 const Address result (rbp, 3 * wordSize);
137 const Address result_type (rbp, 4 * wordSize);
138 const Address method (rbp, 5 * wordSize);
139 const Address entry_point (rbp, 6 * wordSize);
140 const Address parameters (rbp, 7 * wordSize);
141 const Address parameter_size(rbp, 8 * wordSize);
142 const Address thread (rbp, 9 * wordSize); // same as in generate_catch_exception()!
143 sse_save = UseSSE > 0;
144
145 // stub code
146 __ enter();
147 __ movptr(rcx, parameter_size); // parameter counter
148 __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
149 __ addptr(rcx, locals_count_in_bytes); // reserve space for register saves
150 __ subptr(rsp, rcx);
151 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
152
153 // save rdi, rsi, & rbx, according to C calling conventions
154 __ movptr(saved_rdi, rdi);
155 __ movptr(saved_rsi, rsi);
156 __ movptr(saved_rbx, rbx);
157
158 // save and initialize %mxcsr
159 if (sse_save) {
160 Label skip_ldmx;
161 __ stmxcsr(mxcsr_save);
162 __ movl(rax, mxcsr_save);
163 __ andl(rax, MXCSR_MASK); // Only check control and mask bits
164 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
165 __ cmp32(rax, mxcsr_std);
166 __ jcc(Assembler::equal, skip_ldmx);
167 __ ldmxcsr(mxcsr_std);
168 __ bind(skip_ldmx);
169 }
170
171 // make sure the control word is correct.
172 __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
173
174 #ifdef ASSERT
175 // make sure we have no pending exceptions
176 { Label L;
177 __ movptr(rcx, thread);
178 __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
179 __ jcc(Assembler::equal, L);
180 __ stop("StubRoutines::call_stub: entered with pending exception");
181 __ bind(L);
182 }
183 #endif
184
185 // pass parameters if any
186 BLOCK_COMMENT("pass parameters if any");
187 Label parameters_done;
188 __ movl(rcx, parameter_size); // parameter counter
189 __ testl(rcx, rcx);
190 __ jcc(Assembler::zero, parameters_done);
191
192 // parameter passing loop
193
194 Label loop;
195 // Copy Java parameters in reverse order (receiver last)
196 // Note that the argument order is inverted in the process
197 // source is rdx[rcx: N-1..0]
198 // dest is rsp[rbx: 0..N-1]
199
200 __ movptr(rdx, parameters); // parameter pointer
201 __ xorptr(rbx, rbx);
202
203 __ BIND(loop);
204
205 // get parameter
206 __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
207 __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
208 Interpreter::expr_offset_in_bytes(0)), rax); // store parameter
209 __ increment(rbx);
210 __ decrement(rcx);
211 __ jcc(Assembler::notZero, loop);
212
213 // call Java function
214 __ BIND(parameters_done);
215 __ movptr(rbx, method); // get Method*
216 __ movptr(rax, entry_point); // get entry_point
217 __ mov(rsi, rsp); // set sender sp
218 BLOCK_COMMENT("call Java function");
219 __ call(rax);
220
221 BLOCK_COMMENT("call_stub_return_address:");
222 return_address = __ pc();
223
224 #ifdef COMPILER2
225 {
226 Label L_skip;
227 if (UseSSE >= 2) {
228 __ verify_FPU(0, "call_stub_return");
229 } else {
230 for (int i = 1; i < 8; i++) {
231 __ ffree(i);
232 }
233
234 // UseSSE <= 1 so double result should be left on TOS
235 __ movl(rsi, result_type);
236 __ cmpl(rsi, T_DOUBLE);
237 __ jcc(Assembler::equal, L_skip);
238 if (UseSSE == 0) {
239 // UseSSE == 0 so float result should be left on TOS
240 __ cmpl(rsi, T_FLOAT);
241 __ jcc(Assembler::equal, L_skip);
242 }
243 __ ffree(0);
244 }
245 __ BIND(L_skip);
246 }
247 #endif // COMPILER2
248
249 // store result depending on type
250 // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
251 __ movptr(rdi, result);
252 Label is_long, is_float, is_double, exit;
253 __ movl(rsi, result_type);
254 __ cmpl(rsi, T_LONG);
255 __ jcc(Assembler::equal, is_long);
256 __ cmpl(rsi, T_FLOAT);
257 __ jcc(Assembler::equal, is_float);
258 __ cmpl(rsi, T_DOUBLE);
259 __ jcc(Assembler::equal, is_double);
260
261 // handle T_INT case
262 __ movl(Address(rdi, 0), rax);
263 __ BIND(exit);
264
265 // check that FPU stack is empty
266 __ verify_FPU(0, "generate_call_stub");
267
268 // pop parameters
269 __ lea(rsp, rsp_after_call);
270
271 // restore %mxcsr
272 if (sse_save) {
273 __ ldmxcsr(mxcsr_save);
274 }
275
276 // restore rdi, rsi and rbx,
277 __ movptr(rbx, saved_rbx);
278 __ movptr(rsi, saved_rsi);
279 __ movptr(rdi, saved_rdi);
280 __ addptr(rsp, 4*wordSize);
281
282 // return
283 __ pop(rbp);
284 __ ret(0);
285
286 // handle return types different from T_INT
287 __ BIND(is_long);
288 __ movl(Address(rdi, 0 * wordSize), rax);
289 __ movl(Address(rdi, 1 * wordSize), rdx);
290 __ jmp(exit);
291
292 __ BIND(is_float);
293 // interpreter uses xmm0 for return values
294 if (UseSSE >= 1) {
295 __ movflt(Address(rdi, 0), xmm0);
296 } else {
297 __ fstp_s(Address(rdi, 0));
298 }
299 __ jmp(exit);
300
301 __ BIND(is_double);
302 // interpreter uses xmm0 for return values
303 if (UseSSE >= 2) {
304 __ movdbl(Address(rdi, 0), xmm0);
305 } else {
306 __ fstp_d(Address(rdi, 0));
307 }
308 __ jmp(exit);
309
310 return start;
311 }
312
313
314 //------------------------------------------------------------------------------------------------------------------------
315 // Return point for a Java call if there's an exception thrown in Java code.
316 // The exception is caught and transformed into a pending exception stored in
317 // JavaThread that can be tested from within the VM.
318 //
319 // Note: Usually the parameters are removed by the callee. In case of an exception
320 // crossing an activation frame boundary, that is not the case if the callee
321 // is compiled code => need to setup the rsp.
322 //
// rax: exception oop
324
generate_catch_exception()325 address generate_catch_exception() {
326 StubCodeMark mark(this, "StubRoutines", "catch_exception");
327 const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
328 const Address thread (rbp, 9 * wordSize); // same as in generate_call_stub()!
329 address start = __ pc();
330
331 // get thread directly
332 __ movptr(rcx, thread);
333 #ifdef ASSERT
334 // verify that threads correspond
335 { Label L;
336 __ get_thread(rbx);
337 __ cmpptr(rbx, rcx);
338 __ jcc(Assembler::equal, L);
339 __ stop("StubRoutines::catch_exception: threads must correspond");
340 __ bind(L);
341 }
342 #endif
343 // set pending exception
344 __ verify_oop(rax);
345 __ movptr(Address(rcx, Thread::pending_exception_offset()), rax );
346 __ lea(Address(rcx, Thread::exception_file_offset ()),
347 ExternalAddress((address)__FILE__));
348 __ movl(Address(rcx, Thread::exception_line_offset ()), __LINE__ );
349 // complete return to VM
350 assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
351 __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
352
353 return start;
354 }
355
356
357 //------------------------------------------------------------------------------------------------------------------------
358 // Continuation point for runtime calls returning with a pending exception.
359 // The pending exception check happened in the runtime or native call stub.
360 // The pending exception in Thread is converted into a Java-level exception.
361 //
362 // Contract with Java-level exception handlers:
363 // rax: exception
364 // rdx: throwing pc
365 //
366 // NOTE: At entry of this stub, exception-pc must be on stack !!
367
generate_forward_exception()368 address generate_forward_exception() {
369 StubCodeMark mark(this, "StubRoutines", "forward exception");
370 address start = __ pc();
371 const Register thread = rcx;
372
373 // other registers used in this stub
374 const Register exception_oop = rax;
375 const Register handler_addr = rbx;
376 const Register exception_pc = rdx;
377
378 // Upon entry, the sp points to the return address returning into Java
379 // (interpreted or compiled) code; i.e., the return address becomes the
380 // throwing pc.
381 //
382 // Arguments pushed before the runtime call are still on the stack but
383 // the exception handler will reset the stack pointer -> ignore them.
384 // A potential result in registers can be ignored as well.
385
386 #ifdef ASSERT
387 // make sure this code is only executed if there is a pending exception
388 { Label L;
389 __ get_thread(thread);
390 __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
391 __ jcc(Assembler::notEqual, L);
392 __ stop("StubRoutines::forward exception: no pending exception (1)");
393 __ bind(L);
394 }
395 #endif
396
397 // compute exception handler into rbx,
398 __ get_thread(thread);
399 __ movptr(exception_pc, Address(rsp, 0));
400 BLOCK_COMMENT("call exception_handler_for_return_address");
401 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
402 __ mov(handler_addr, rax);
403
404 // setup rax & rdx, remove return address & clear pending exception
405 __ get_thread(thread);
406 __ pop(exception_pc);
407 __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
408 __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
409
410 #ifdef ASSERT
411 // make sure exception is set
412 { Label L;
413 __ testptr(exception_oop, exception_oop);
414 __ jcc(Assembler::notEqual, L);
415 __ stop("StubRoutines::forward exception: no pending exception (2)");
416 __ bind(L);
417 }
418 #endif
419
420 // Verify that there is really a valid exception in RAX.
421 __ verify_oop(exception_oop);
422
423 // continue at exception handler (return address removed)
424 // rax: exception
425 // rbx: exception handler
426 // rdx: throwing pc
427 __ jmp(handler_addr);
428
429 return start;
430 }
431
432
433 //----------------------------------------------------------------------------------------------------
434 // Implementation of int32_t atomic_xchg(int32_t exchange_value, volatile int32_t* dest)
435 // used by Atomic::xchg(volatile int32_t* dest, int32_t exchange_value)
436 //
437 // xchg exists as far back as 8086, lock needed for MP only
438 // Stack layout immediately after call:
439 //
440 // 0 [ret addr ] <--- rsp
441 // 1 [ ex ]
442 // 2 [ dest ]
443 //
444 // Result: *dest <- ex, return (old *dest)
445 //
446 // Note: win32 does not currently use this code
447
generate_atomic_xchg()448 address generate_atomic_xchg() {
449 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
450 address start = __ pc();
451
452 __ push(rdx);
453 Address exchange(rsp, 2 * wordSize);
454 Address dest_addr(rsp, 3 * wordSize);
455 __ movl(rax, exchange);
456 __ movptr(rdx, dest_addr);
457 __ xchgl(rax, Address(rdx, 0));
458 __ pop(rdx);
459 __ ret(0);
460
461 return start;
462 }
463
464 //----------------------------------------------------------------------------------------------------
465 // Support for void verify_mxcsr()
466 //
467 // This routine is used with -Xcheck:jni to verify that native
468 // JNI code does not return to Java code without restoring the
469 // MXCSR register to our expected state.
470
471
generate_verify_mxcsr()472 address generate_verify_mxcsr() {
473 StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
474 address start = __ pc();
475
476 const Address mxcsr_save(rsp, 0);
477
478 if (CheckJNICalls && UseSSE > 0 ) {
479 Label ok_ret;
480 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
481 __ push(rax);
482 __ subptr(rsp, wordSize); // allocate a temp location
483 __ stmxcsr(mxcsr_save);
484 __ movl(rax, mxcsr_save);
485 __ andl(rax, MXCSR_MASK);
486 __ cmp32(rax, mxcsr_std);
487 __ jcc(Assembler::equal, ok_ret);
488
489 __ warn("MXCSR changed by native JNI code.");
490
491 __ ldmxcsr(mxcsr_std);
492
493 __ bind(ok_ret);
494 __ addptr(rsp, wordSize);
495 __ pop(rax);
496 }
497
498 __ ret(0);
499
500 return start;
501 }
502
503
504 //---------------------------------------------------------------------------
505 // Support for void verify_fpu_cntrl_wrd()
506 //
507 // This routine is used with -Xcheck:jni to verify that native
508 // JNI code does not return to Java code without restoring the
509 // FP control word to our expected state.
510
generate_verify_fpu_cntrl_wrd()511 address generate_verify_fpu_cntrl_wrd() {
512 StubCodeMark mark(this, "StubRoutines", "verify_spcw");
513 address start = __ pc();
514
515 const Address fpu_cntrl_wrd_save(rsp, 0);
516
517 if (CheckJNICalls) {
518 Label ok_ret;
519 __ push(rax);
520 __ subptr(rsp, wordSize); // allocate a temp location
521 __ fnstcw(fpu_cntrl_wrd_save);
522 __ movl(rax, fpu_cntrl_wrd_save);
523 __ andl(rax, FPU_CNTRL_WRD_MASK);
524 ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
525 __ cmp32(rax, fpu_std);
526 __ jcc(Assembler::equal, ok_ret);
527
528 __ warn("Floating point control word changed by native JNI code.");
529
530 __ fldcw(fpu_std);
531
532 __ bind(ok_ret);
533 __ addptr(rsp, wordSize);
534 __ pop(rax);
535 }
536
537 __ ret(0);
538
539 return start;
540 }
541
542 //---------------------------------------------------------------------------
// Wrapper for slow-case handling of double-to-integer conversion.
// It is used when the d2i or f2i fast case failed, either because the value
// is NaN or because of under/overflow.
// Input:  FPU TOS: float value
// Output: rax, (rdx): integer (long) result
548
generate_d2i_wrapper(BasicType t,address fcn)549 address generate_d2i_wrapper(BasicType t, address fcn) {
550 StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
551 address start = __ pc();
552
553 // Capture info about frame layout
554 enum layout { FPUState_off = 0,
555 rbp_off = FPUStateSizeInWords,
556 rdi_off,
557 rsi_off,
558 rcx_off,
559 rbx_off,
560 saved_argument_off,
561 saved_argument_off2, // 2nd half of double
562 framesize
563 };
564
565 assert(FPUStateSizeInWords == 27, "update stack layout");
566
567 // Save outgoing argument to stack across push_FPU_state()
568 __ subptr(rsp, wordSize * 2);
569 __ fstp_d(Address(rsp, 0));
570
571 // Save CPU & FPU state
572 __ push(rbx);
573 __ push(rcx);
574 __ push(rsi);
575 __ push(rdi);
576 __ push(rbp);
577 __ push_FPU_state();
578
579 // push_FPU_state() resets the FP top of stack
580 // Load original double into FP top of stack
581 __ fld_d(Address(rsp, saved_argument_off * wordSize));
582 // Store double into stack as outgoing argument
583 __ subptr(rsp, wordSize*2);
584 __ fst_d(Address(rsp, 0));
585
586 // Prepare FPU for doing math in C-land
587 __ empty_FPU_stack();
588 // Call the C code to massage the double. Result in EAX
589 if (t == T_INT)
590 { BLOCK_COMMENT("SharedRuntime::d2i"); }
591 else if (t == T_LONG)
592 { BLOCK_COMMENT("SharedRuntime::d2l"); }
593 __ call_VM_leaf( fcn, 2 );
594
595 // Restore CPU & FPU state
596 __ pop_FPU_state();
597 __ pop(rbp);
598 __ pop(rdi);
599 __ pop(rsi);
600 __ pop(rcx);
601 __ pop(rbx);
602 __ addptr(rsp, wordSize * 2);
603
604 __ ret(0);
605
606 return start;
607 }
608 //---------------------------------------------------------------------------------------------------
609
generate_vector_mask(const char * stub_name,int32_t mask)610 address generate_vector_mask(const char *stub_name, int32_t mask) {
611 __ align(CodeEntryAlignment);
612 StubCodeMark mark(this, "StubRoutines", stub_name);
613 address start = __ pc();
614
615 for (int i = 0; i < 16; i++) {
616 __ emit_data(mask, relocInfo::none, 0);
617 }
618
619 return start;
620 }
621
generate_vector_mask_long_double(const char * stub_name,int32_t maskhi,int32_t masklo)622 address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
623 __ align(CodeEntryAlignment);
624 StubCodeMark mark(this, "StubRoutines", stub_name);
625 address start = __ pc();
626
627 for (int i = 0; i < 8; i++) {
628 __ emit_data(masklo, relocInfo::none, 0);
629 __ emit_data(maskhi, relocInfo::none, 0);
630 }
631
632 return start;
633 }
634
635 //----------------------------------------------------------------------------------------------------
636
generate_vector_byte_perm_mask(const char * stub_name)637 address generate_vector_byte_perm_mask(const char *stub_name) {
638 __ align(CodeEntryAlignment);
639 StubCodeMark mark(this, "StubRoutines", stub_name);
640 address start = __ pc();
641
642 __ emit_data(0x00000001, relocInfo::none, 0);
643 __ emit_data(0x00000000, relocInfo::none, 0);
644 __ emit_data(0x00000003, relocInfo::none, 0);
645 __ emit_data(0x00000000, relocInfo::none, 0);
646 __ emit_data(0x00000005, relocInfo::none, 0);
647 __ emit_data(0x00000000, relocInfo::none, 0);
648 __ emit_data(0x00000007, relocInfo::none, 0);
649 __ emit_data(0x00000000, relocInfo::none, 0);
650 __ emit_data(0x00000000, relocInfo::none, 0);
651 __ emit_data(0x00000000, relocInfo::none, 0);
652 __ emit_data(0x00000002, relocInfo::none, 0);
653 __ emit_data(0x00000000, relocInfo::none, 0);
654 __ emit_data(0x00000004, relocInfo::none, 0);
655 __ emit_data(0x00000000, relocInfo::none, 0);
656 __ emit_data(0x00000006, relocInfo::none, 0);
657 __ emit_data(0x00000000, relocInfo::none, 0);
658
659 return start;
660 }
661
662 //----------------------------------------------------------------------------------------------------
663 // Non-destructive plausibility checks for oops
664
generate_verify_oop()665 address generate_verify_oop() {
666 StubCodeMark mark(this, "StubRoutines", "verify_oop");
667 address start = __ pc();
668
669 // Incoming arguments on stack after saving rax,:
670 //
671 // [tos ]: saved rdx
672 // [tos + 1]: saved EFLAGS
673 // [tos + 2]: return address
674 // [tos + 3]: char* error message
675 // [tos + 4]: oop object to verify
676 // [tos + 5]: saved rax, - saved by caller and bashed
677
678 Label exit, error;
679 __ pushf();
680 __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
681 __ push(rdx); // save rdx
682 // make sure object is 'reasonable'
683 __ movptr(rax, Address(rsp, 4 * wordSize)); // get object
684 __ testptr(rax, rax);
685 __ jcc(Assembler::zero, exit); // if obj is NULL it is ok
686
687 // Check if the oop is in the right area of memory
688 const int oop_mask = Universe::verify_oop_mask();
689 const int oop_bits = Universe::verify_oop_bits();
690 __ mov(rdx, rax);
691 __ andptr(rdx, oop_mask);
692 __ cmpptr(rdx, oop_bits);
693 __ jcc(Assembler::notZero, error);
694
695 // make sure klass is 'reasonable', which is not zero.
696 __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
697 __ testptr(rax, rax);
698 __ jcc(Assembler::zero, error); // if klass is NULL it is broken
699
700 // return if everything seems ok
701 __ bind(exit);
702 __ movptr(rax, Address(rsp, 5 * wordSize)); // get saved rax, back
703 __ pop(rdx); // restore rdx
704 __ popf(); // restore EFLAGS
705 __ ret(3 * wordSize); // pop arguments
706
707 // handle errors
708 __ bind(error);
709 __ movptr(rax, Address(rsp, 5 * wordSize)); // get saved rax, back
710 __ pop(rdx); // get saved rdx back
711 __ popf(); // get saved EFLAGS off stack -- will be ignored
712 __ pusha(); // push registers (eip = return address & msg are already pushed)
713 BLOCK_COMMENT("call MacroAssembler::debug");
714 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
715 __ hlt();
716 return start;
717 }
718
719
720 // Copy 64 bytes chunks
721 //
722 // Inputs:
723 // from - source array address
724 // to_from - destination array address - from
725 // qword_count - 8-bytes element count, negative
726 //
xmm_copy_forward(Register from,Register to_from,Register qword_count)727 void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
728 assert( UseSSE >= 2, "supported cpu only" );
729 Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
730
731 // Copy 64-byte chunks
732 __ jmpb(L_copy_64_bytes);
733 __ align(OptoLoopAlignment);
734 __ BIND(L_copy_64_bytes_loop);
735
736 if (UseUnalignedLoadStores) {
737 if (UseAVX > 2) {
738 __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
739 __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
740 } else if (UseAVX == 2) {
741 __ vmovdqu(xmm0, Address(from, 0));
742 __ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
743 __ vmovdqu(xmm1, Address(from, 32));
744 __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
745 } else {
746 __ movdqu(xmm0, Address(from, 0));
747 __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
748 __ movdqu(xmm1, Address(from, 16));
749 __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
750 __ movdqu(xmm2, Address(from, 32));
751 __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
752 __ movdqu(xmm3, Address(from, 48));
753 __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
754 }
755 } else {
756 __ movq(xmm0, Address(from, 0));
757 __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
758 __ movq(xmm1, Address(from, 8));
759 __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
760 __ movq(xmm2, Address(from, 16));
761 __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
762 __ movq(xmm3, Address(from, 24));
763 __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
764 __ movq(xmm4, Address(from, 32));
765 __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
766 __ movq(xmm5, Address(from, 40));
767 __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
768 __ movq(xmm6, Address(from, 48));
769 __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
770 __ movq(xmm7, Address(from, 56));
771 __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
772 }
773
774 __ addl(from, 64);
775 __ BIND(L_copy_64_bytes);
776 __ subl(qword_count, 8);
777 __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
778
779 if (UseUnalignedLoadStores && (UseAVX == 2)) {
780 // clean upper bits of YMM registers
781 __ vpxor(xmm0, xmm0);
782 __ vpxor(xmm1, xmm1);
783 }
784 __ addl(qword_count, 8);
785 __ jccb(Assembler::zero, L_exit);
786 //
787 // length is too short, just copy qwords
788 //
789 __ BIND(L_copy_8_bytes);
790 __ movq(xmm0, Address(from, 0));
791 __ movq(Address(from, to_from, Address::times_1), xmm0);
792 __ addl(from, 8);
793 __ decrement(qword_count);
794 __ jcc(Assembler::greater, L_copy_8_bytes);
795 __ BIND(L_exit);
796 }
797
generate_disjoint_copy(BasicType t,bool aligned,Address::ScaleFactor sf,address * entry,const char * name,bool dest_uninitialized=false)798 address generate_disjoint_copy(BasicType t, bool aligned,
799 Address::ScaleFactor sf,
800 address* entry, const char *name,
801 bool dest_uninitialized = false) {
802 __ align(CodeEntryAlignment);
803 StubCodeMark mark(this, "StubRoutines", name);
804 address start = __ pc();
805
806 Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
807 Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
808
809 int shift = Address::times_ptr - sf;
810
811 const Register from = rsi; // source array address
812 const Register to = rdi; // destination array address
813 const Register count = rcx; // elements count
814 const Register to_from = to; // (to - from)
815 const Register saved_to = rdx; // saved destination array address
816
817 __ enter(); // required for proper stackwalking of RuntimeStub frame
818 __ push(rsi);
819 __ push(rdi);
820 __ movptr(from , Address(rsp, 12+ 4));
821 __ movptr(to , Address(rsp, 12+ 8));
822 __ movl(count, Address(rsp, 12+ 12));
823
824 if (entry != NULL) {
825 *entry = __ pc(); // Entry point from conjoint arraycopy stub.
826 BLOCK_COMMENT("Entry:");
827 }
828
829 if (t == T_OBJECT) {
830 __ testl(count, count);
831 __ jcc(Assembler::zero, L_0_count);
832 }
833
834 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
835 if (dest_uninitialized) {
836 decorators |= IS_DEST_UNINITIALIZED;
837 }
838 if (aligned) {
839 decorators |= ARRAYCOPY_ALIGNED;
840 }
841
842 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
843 bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
844 {
845 bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
846 // UnsafeCopyMemory page error: continue after ucm
847 UnsafeCopyMemoryMark ucmm(this, add_entry, true);
848 __ subptr(to, from); // to --> to_from
849 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
850 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
851 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
852 // align source address at 4 bytes address boundary
853 if (t == T_BYTE) {
854 // One byte misalignment happens only for byte arrays
855 __ testl(from, 1);
856 __ jccb(Assembler::zero, L_skip_align1);
857 __ movb(rax, Address(from, 0));
858 __ movb(Address(from, to_from, Address::times_1, 0), rax);
859 __ increment(from);
860 __ decrement(count);
861 __ BIND(L_skip_align1);
862 }
863 // Two bytes misalignment happens only for byte and short (char) arrays
864 __ testl(from, 2);
865 __ jccb(Assembler::zero, L_skip_align2);
866 __ movw(rax, Address(from, 0));
867 __ movw(Address(from, to_from, Address::times_1, 0), rax);
868 __ addptr(from, 2);
869 __ subl(count, 1<<(shift-1));
870 __ BIND(L_skip_align2);
871 }
872 if (!UseXMMForArrayCopy) {
873 __ mov(rax, count); // save 'count'
874 __ shrl(count, shift); // bytes count
875 __ addptr(to_from, from);// restore 'to'
876 __ rep_mov();
877 __ subptr(to_from, from);// restore 'to_from'
878 __ mov(count, rax); // restore 'count'
879 __ jmpb(L_copy_2_bytes); // all dwords were copied
880 } else {
881 if (!UseUnalignedLoadStores) {
882 // align to 8 bytes, we know we are 4 byte aligned to start
883 __ testptr(from, 4);
884 __ jccb(Assembler::zero, L_copy_64_bytes);
885 __ movl(rax, Address(from, 0));
886 __ movl(Address(from, to_from, Address::times_1, 0), rax);
887 __ addptr(from, 4);
888 __ subl(count, 1<<shift);
889 }
890 __ BIND(L_copy_64_bytes);
891 __ mov(rax, count);
892 __ shrl(rax, shift+1); // 8 bytes chunk count
893 //
894 // Copy 8-byte chunks through XMM registers, 8 per iteration of the loop
895 //
896 xmm_copy_forward(from, to_from, rax);
897 }
898 // copy tailing dword
899 __ BIND(L_copy_4_bytes);
900 __ testl(count, 1<<shift);
901 __ jccb(Assembler::zero, L_copy_2_bytes);
902 __ movl(rax, Address(from, 0));
903 __ movl(Address(from, to_from, Address::times_1, 0), rax);
904 if (t == T_BYTE || t == T_SHORT) {
905 __ addptr(from, 4);
906 __ BIND(L_copy_2_bytes);
907 // copy tailing word
908 __ testl(count, 1<<(shift-1));
909 __ jccb(Assembler::zero, L_copy_byte);
910 __ movw(rax, Address(from, 0));
911 __ movw(Address(from, to_from, Address::times_1, 0), rax);
912 if (t == T_BYTE) {
913 __ addptr(from, 2);
914 __ BIND(L_copy_byte);
915 // copy tailing byte
916 __ testl(count, 1);
917 __ jccb(Assembler::zero, L_exit);
918 __ movb(rax, Address(from, 0));
919 __ movb(Address(from, to_from, Address::times_1, 0), rax);
920 __ BIND(L_exit);
921 } else {
922 __ BIND(L_copy_byte);
923 }
924 } else {
925 __ BIND(L_copy_2_bytes);
926 }
927 }
928
929 __ movl(count, Address(rsp, 12+12)); // reread 'count'
930 bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
931
932 if (t == T_OBJECT) {
933 __ BIND(L_0_count);
934 }
935 inc_copy_counter_np(t);
936 __ pop(rdi);
937 __ pop(rsi);
938 __ leave(); // required for proper stackwalking of RuntimeStub frame
939 __ vzeroupper();
940 __ xorptr(rax, rax); // return 0
941 __ ret(0);
942 return start;
943 }
944
945
// Emits the array-fill stub for element type 't' and returns the address of
// its first instruction. The stub loads its three stack arguments
// (destination address, fill value, element count) into registers and hands
// the fill itself to the assembler's generate_fill() routine.
//   t       - element type, forwarded to the assembler's generate_fill()
//   aligned - forwarded unchanged to the assembler's generate_fill()
//   name    - stub name recorded by the StubCodeMark
generate_fill(BasicType t,bool aligned,const char * name)946 address generate_fill(BasicType t, bool aligned, const char *name) {
947 __ align(CodeEntryAlignment);
948 StubCodeMark mark(this, "StubRoutines", name);
949 address start = __ pc();
950
951 BLOCK_COMMENT("Entry:");
952
953 const Register to = rdi; // destination array address (the memory being filled)
954 const Register value = rdx; // value
955 const Register count = rsi; // elements count
956
957 __ enter(); // required for proper stackwalking of RuntimeStub frame
// rsi and rdi are used as working registers below, so they are saved here
// and restored (in reverse order) before returning.
958 __ push(rsi);
959 __ push(rdi);
// Arguments are addressed as rsp + 12 + N. Presumably the 12 covers the three
// words pushed above (frame pointer by enter(), rsi, rdi) and N starts at 4
// to step over the return address -- confirm against the frame layout.
960 __ movptr(to , Address(rsp, 12+ 4));
961 __ movl(value, Address(rsp, 12+ 8));
962 __ movl(count, Address(rsp, 12+ 12));
963
// rax and xmm0 are passed as scratch registers for the fill code.
964 __ generate_fill(t, aligned, to, value, count, rax, xmm0);
965
966 __ pop(rdi);
967 __ pop(rsi);
968 __ leave(); // required for proper stackwalking of RuntimeStub frame
969 __ ret(0);
970 return start;
971 }
972
// Emits an array copy stub that tolerates overlapping source and destination
// ranges, and returns the address of its first instruction.
//
// The stub first tests how the two ranges relate. If the destination starts
// at or below the source, or at or beyond the end of the source range, it
// jumps to 'nooverlap_target'. Otherwise it copies from high addresses to
// low addresses.
//
//   t                  - element type; selects which alignment and tail
//                        steps are emitted
//   aligned            - adds ARRAYCOPY_ALIGNED to the barrier decorators
//   sf                 - address scale factor for one element
//   nooverlap_target   - code address jumped to when the overlap test passes
//   entry              - if non-NULL, receives the pc just after the stack
//                        arguments have been loaded into rax/rdx/rcx
//   name               - stub name recorded by the StubCodeMark
//   dest_uninitialized - adds IS_DEST_UNINITIALIZED to the barrier decorators
//
// The generated stub returns 0 in rax.
generate_conjoint_copy(BasicType t,bool aligned,Address::ScaleFactor sf,address nooverlap_target,address * entry,const char * name,bool dest_uninitialized=false)973 address generate_conjoint_copy(BasicType t, bool aligned,
974 Address::ScaleFactor sf,
975 address nooverlap_target,
976 address* entry, const char *name,
977 bool dest_uninitialized = false) {
978 __ align(CodeEntryAlignment);
979 StubCodeMark mark(this, "StubRoutines", name);
980 address start = __ pc();
981
982 Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
983 Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
984
// 'shift' is the log2 of the number of elements per pointer-sized word, so
// (1 << shift) elements make up 4 bytes and (2 << shift) elements 8 bytes
// (see the size comments on the compares below).
985 int shift = Address::times_ptr - sf;
986
// Note the aliasing: 'src' and 'end' are both rax, and 'dst' is rdx, which is
// also used as the data scratch register by the copy code below. The working
// copies of the two addresses live in 'from' (rsi) and 'to' (rdi).
987 const Register src = rax; // source array address
988 const Register dst = rdx; // destination array address
989 const Register from = rsi; // source array address
990 const Register to = rdi; // destination array address
991 const Register count = rcx; // elements count
992 const Register end = rax; // array end address (aliases 'src')
993
994 __ enter(); // required for proper stackwalking of RuntimeStub frame
995 __ push(rsi);
996 __ push(rdi);
997 __ movptr(src , Address(rsp, 12+ 4)); // from
998 __ movptr(dst , Address(rsp, 12+ 8)); // to
999 __ movl2ptr(count, Address(rsp, 12+12)); // count
1000
// A caller using '*entry' jumps in after the pushes and argument loads above,
// so it has to provide rax/rdx/rcx and the pushed rsi/rdi itself.
1001 if (entry != NULL) {
1002 *entry = __ pc(); // Entry point from generic arraycopy stub.
1003 BLOCK_COMMENT("Entry:");
1004 }
1005
1006 // nooverlap_target expects arguments in rsi and rdi.
1007 __ mov(from, src);
1008 __ mov(to , dst);
1009
1010 // arrays overlap test: dispatch to disjoint stub if necessary.
1011 RuntimeAddress nooverlap(nooverlap_target);
// The lea sits between the compare and the jump that consumes its flags; it
// overwrites 'src' (rax) with the end address, which is why 'from' was
// copied out of 'src' beforehand.
1012 __ cmpptr(dst, src);
1013 __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1014 __ jump_cc(Assembler::belowEqual, nooverlap);
1015 __ cmpptr(dst, end);
1016 __ jump_cc(Assembler::aboveEqual, nooverlap);
1017
// For object arrays a zero count skips the copy and both barrier calls.
1018 if (t == T_OBJECT) {
1019 __ testl(count, count);
1020 __ jcc(Assembler::zero, L_0_count);
1021 }
1022
1023 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1024 if (dest_uninitialized) {
1025 decorators |= IS_DEST_UNINITIALIZED;
1026 }
1027 if (aligned) {
1028 decorators |= ARRAYCOPY_ALIGNED;
1029 }
1030
1031 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1032 bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
1033
1034 {
1035 bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
1036 // UnsafeCopyMemory page error: continue after ucm
1037 UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1038 // copy from high to low
1039 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1040 __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1041 if (t == T_BYTE || t == T_SHORT) {
1042 // Align the end of destination array at 4 bytes address boundary
1043 __ lea(end, Address(dst, count, sf, 0));
1044 if (t == T_BYTE) {
1045 // One byte misalignment happens only for byte arrays
1046 __ testl(end, 1);
1047 __ jccb(Assembler::zero, L_skip_align1);
// 'count' is lowered first so that it indexes the last element; rdx ('dst')
// is free to be used as scratch because 'to' holds the destination address.
1048 __ decrement(count);
1049 __ movb(rdx, Address(from, count, sf, 0));
1050 __ movb(Address(to, count, sf, 0), rdx);
1051 __ BIND(L_skip_align1);
1052 }
1053 // Two bytes misalignment happens only for byte and short (char) arrays
1054 __ testl(end, 2);
1055 __ jccb(Assembler::zero, L_skip_align2);
1056 __ subptr(count, 1<<(shift-1));
1057 __ movw(rdx, Address(from, count, sf, 0));
1058 __ movw(Address(to, count, sf, 0), rdx);
1059 __ BIND(L_skip_align2);
// The alignment steps may have reduced 'count', so the short-array test is
// repeated.
1060 __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1061 __ jcc(Assembler::below, L_copy_4_bytes);
1062 }
1063
1064 if (!UseXMMForArrayCopy) {
// String-move path: std() selects the descending direction for rep_mov and
// cld() restores the ascending direction afterwards. rsi/rdi are pointed at
// the last 4-byte unit of each range.
1065 __ std();
1066 __ mov(rax, count); // Save 'count'
1067 __ mov(rdx, to); // Save 'to'
1068 __ lea(rsi, Address(from, count, sf, -4));
1069 __ lea(rdi, Address(to , count, sf, -4));
1070 __ shrptr(count, shift); // element count -> dword (4-byte) count
1071 __ rep_mov();
1072 __ cld();
1073 __ mov(count, rax); // restore 'count'
1074 __ andl(count, (1<<shift)-1); // mask the number of rest elements
1075 __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1076 __ mov(to, rdx); // restore 'to'
1077 __ jmpb(L_copy_2_bytes); // all dword were copied
1078 } else {
1079 // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1080 __ testptr(end, 4);
1081 __ jccb(Assembler::zero, L_copy_8_bytes);
1082 __ subl(count, 1<<shift);
1083 __ movl(rdx, Address(from, count, sf, 0));
1084 __ movl(Address(to, count, sf, 0), rdx);
1085 __ jmpb(L_copy_8_bytes);
1086
1087 __ align(OptoLoopAlignment);
1088 // Move 8 bytes
// Rotated loop: entered at L_copy_8_bytes, which lowers 'count' by 8 bytes
// worth of elements and loops while the result is non-negative. The final
// addl undoes the last subtraction, leaving the remaining element count.
1089 __ BIND(L_copy_8_bytes_loop);
1090 __ movq(xmm0, Address(from, count, sf, 0));
1091 __ movq(Address(to, count, sf, 0), xmm0);
1092 __ BIND(L_copy_8_bytes);
1093 __ subl(count, 2<<shift);
1094 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1095 __ addl(count, 2<<shift);
1096 }
1097 __ BIND(L_copy_4_bytes);
1098 // copy remaining dword (4 bytes), if any
1099 __ testl(count, 1<<shift);
1100 __ jccb(Assembler::zero, L_copy_2_bytes);
1101 __ movl(rdx, Address(from, count, sf, -4));
1102 __ movl(Address(to, count, sf, -4), rdx);
1103
// The labels L_copy_2_bytes, L_copy_byte and L_exit are each bound exactly
// once on every path through the C++ branches below, whatever 't' is.
1104 if (t == T_BYTE || t == T_SHORT) {
1105 __ subl(count, (1<<shift));
1106 __ BIND(L_copy_2_bytes);
1107 // copy remaining word (2 bytes), if any
1108 __ testl(count, 1<<(shift-1));
1109 __ jccb(Assembler::zero, L_copy_byte);
1110 __ movw(rdx, Address(from, count, sf, -2));
1111 __ movw(Address(to, count, sf, -2), rdx);
1112 if (t == T_BYTE) {
1113 __ subl(count, 1<<(shift-1));
1114 __ BIND(L_copy_byte);
1115 // copy remaining byte (the first element of the range), if any
1116 __ testl(count, 1);
1117 __ jccb(Assembler::zero, L_exit);
1118 __ movb(rdx, Address(from, 0));
1119 __ movb(Address(to, 0), rdx);
1120 __ BIND(L_exit);
1121 } else {
1122 __ BIND(L_copy_byte);
1123 }
1124 } else {
1125 __ BIND(L_copy_2_bytes);
1126 }
1127 }
1128
// 'count' was consumed by the copy, so the original value is taken from the
// stack argument again for the barrier epilogue.
1129 __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1130 bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
1131
1132 if (t == T_OBJECT) {
1133 __ BIND(L_0_count);
1134 }
1135 inc_copy_counter_np(t);
1136 __ pop(rdi);
1137 __ pop(rsi);
1138 __ leave(); // required for proper stackwalking of RuntimeStub frame
1139 __ xorptr(rax, rax); // return 0
1140 __ ret(0);
1141 return start;
1142 }
1143
1144
// Emits the forward (low-to-high) copy stub for 8-byte elements and returns
// the address of its first instruction.
//
//   entry - receives the pc just after the stack arguments have been loaded;
//           it is dereferenced unconditionally, so it must not be NULL
//   name  - stub name recorded by the StubCodeMark
//
// Unlike the stubs that push rsi/rdi, this one pushes nothing besides the
// frame set up by enter(), so its arguments are read at rsp + 8 + N.
// The generated stub returns 0 in rax.
generate_disjoint_long_copy(address * entry,const char * name)1145 address generate_disjoint_long_copy(address* entry, const char *name) {
1146 __ align(CodeEntryAlignment);
1147 StubCodeMark mark(this, "StubRoutines", name);
1148 address start = __ pc();
1149
1150 Label L_copy_8_bytes, L_copy_8_bytes_loop;
// 'to' and 'to_from' are the same register (rdx): the destination address is
// turned into the (to - from) distance by the subptr below.
1151 const Register from = rax; // source array address
1152 const Register to = rdx; // destination array address
1153 const Register count = rcx; // elements count
1154 const Register to_from = rdx; // (to - from)
1155
1156 __ enter(); // required for proper stackwalking of RuntimeStub frame
1157 __ movptr(from , Address(rsp, 8+0)); // from
1158 __ movptr(to , Address(rsp, 8+4)); // to
1159 __ movl2ptr(count, Address(rsp, 8+8)); // count
1160
1161 *entry = __ pc(); // Entry point from conjoint arraycopy stub.
1162 BLOCK_COMMENT("Entry:");
1163
1164 {
1165 // UnsafeCopyMemory page error: continue after ucm
1166 UnsafeCopyMemoryMark ucmm(this, true, true);
1167 __ subptr(to, from); // to --> to_from
1168 if (UseXMMForArrayCopy) {
// xmm_copy_forward is a helper defined outside this block.
1169 xmm_copy_forward(from, to_from, count);
1170 } else {
// Rotated loop: entered at L_copy_8_bytes, which decrements 'count' and loops
// while the result is non-negative. Each iteration moves one 8-byte element
// with a fild_d/fistp_d pair; the store address from + to_from is the
// matching destination slot.
1171 __ jmpb(L_copy_8_bytes);
1172 __ align(OptoLoopAlignment);
1173 __ BIND(L_copy_8_bytes_loop);
1174 __ fild_d(Address(from, 0));
1175 __ fistp_d(Address(from, to_from, Address::times_1));
1176 __ addptr(from, 8);
1177 __ BIND(L_copy_8_bytes);
1178 __ decrement(count);
1179 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1180 }
1181 }
1182 inc_copy_counter_np(T_LONG);
1183 __ leave(); // required for proper stackwalking of RuntimeStub frame
1184 __ vzeroupper();
1185 __ xorptr(rax, rax); // return 0
1186 __ ret(0);
1187 return start;
1188 }
1189
// Emits the overlap-tolerant copy stub for 8-byte elements and returns the
// address of its first instruction.
//
// If the destination starts at or below the source, or at or beyond the end
// of the source range, the stub jumps to 'nooverlap_target'. Otherwise it
// copies element by element from the highest index down to index 0.
//
//   nooverlap_target - code address jumped to when the overlap test passes
//   entry            - receives the pc just after the stack arguments have
//                      been loaded; dereferenced unconditionally, so it must
//                      not be NULL
//   name             - stub name recorded by the StubCodeMark
//
// The generated stub returns 0 in rax.
generate_conjoint_long_copy(address nooverlap_target,address * entry,const char * name)1190 address generate_conjoint_long_copy(address nooverlap_target,
1191 address* entry, const char *name) {
1192 __ align(CodeEntryAlignment);
1193 StubCodeMark mark(this, "StubRoutines", name);
1194 address start = __ pc();
1195
1196 Label L_copy_8_bytes, L_copy_8_bytes_loop;
// 'from' and 'end_from' are the same register (rax).
1197 const Register from = rax; // source array address
1198 const Register to = rdx; // destination array address
1199 const Register count = rcx; // elements count
1200 const Register end_from = rax; // source array end address
1201
1202 __ enter(); // required for proper stackwalking of RuntimeStub frame
1203 __ movptr(from , Address(rsp, 8+0)); // from
1204 __ movptr(to , Address(rsp, 8+4)); // to
1205 __ movl2ptr(count, Address(rsp, 8+8)); // count
1206
1207 *entry = __ pc(); // Entry point from generic arraycopy stub.
1208 BLOCK_COMMENT("Entry:");
1209
1210 // arrays overlap test
1211 __ cmpptr(to, from);
1212 RuntimeAddress nooverlap(nooverlap_target);
1213 __ jump_cc(Assembler::belowEqual, nooverlap);
// Computing the end address overwrites 'from' (same register), so 'from' is
// reloaded from its stack slot. The reload sits between the compare and the
// conditional jump and therefore relies on the move leaving the flags intact.
1214 __ lea(end_from, Address(from, count, Address::times_8, 0));
1215 __ cmpptr(to, end_from);
1216 __ movptr(from, Address(rsp, 8)); // from
1217 __ jump_cc(Assembler::aboveEqual, nooverlap);
1218
1219 {
1220 // UnsafeCopyMemory page error: continue after ucm
1221 UnsafeCopyMemoryMark ucmm(this, true, true);
1222
// Rotated loop: entered at L_copy_8_bytes, which decrements 'count' before it
// is used as the element index, so indices count-1 down to 0 are copied.
1223 __ jmpb(L_copy_8_bytes);
1224
1225 __ align(OptoLoopAlignment);
1226 __ BIND(L_copy_8_bytes_loop);
1227 if (UseXMMForArrayCopy) {
1228 __ movq(xmm0, Address(from, count, Address::times_8));
1229 __ movq(Address(to, count, Address::times_8), xmm0);
1230 } else {
1231 __ fild_d(Address(from, count, Address::times_8));
1232 __ fistp_d(Address(to, count, Address::times_8));
1233 }
1234 __ BIND(L_copy_8_bytes);
1235 __ decrement(count);
1236 __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1237
1238 }
1239 inc_copy_counter_np(T_LONG);
1240 __ leave(); // required for proper stackwalking of RuntimeStub frame
1241 __ xorptr(rax, rax); // return 0
1242 __ ret(0);
1243 return start;
1244 }
1245
1246
1247 // Helper for generating a dynamic type check.
1248 // The sub_klass must be one of {rbx, rdx, rsi}.
1249 // The temp is killed.
// Emits code that tests whether the klass in 'sub_klass' is a subtype of the
// klass stored at 'super_klass_addr'.
//   sub_klass               - register holding the klass being tested
//   super_check_offset_addr - memory operand holding the super check offset
//   super_klass_addr        - memory operand holding the super klass
//   temp                    - scratch register; must differ from sub_klass
//   L_success / L_failure   - branch targets for the two outcomes. Either may
//                             be NULL, in which case the conditional jumps
//                             emitted here for that outcome go to the end of
//                             the generated check instead (see LOCAL_JCC).
generate_type_check(Register sub_klass,Address & super_check_offset_addr,Address & super_klass_addr,Register temp,Label * L_success,Label * L_failure)1250 void generate_type_check(Register sub_klass,
1251 Address& super_check_offset_addr,
1252 Address& super_klass_addr,
1253 Register temp,
1254 Label* L_success, Label* L_failure) {
1255 BLOCK_COMMENT("type_check:");
1256
1257 Label L_fallthrough;
// LOCAL_JCC(cond, label_ptr): conditional jump to *label_ptr, or to
// L_fallthrough when label_ptr is NULL.
1258 #define LOCAL_JCC(assembler_con, label_ptr) \
1259 if (label_ptr != NULL) __ jcc(assembler_con, *(label_ptr)); \
1260 else __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1261
1262 // The following is a strange variation of the fast path which requires
1263 // one less register, because needed values are on the argument stack.
1264 // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1265 // L_success, L_failure, NULL);
1266 assert_different_registers(sub_klass, temp);
1267
1268 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1269
1270 // if the pointers are equal, we are done (e.g., String[] elements)
1271 __ cmpptr(sub_klass, super_klass_addr);
1272 LOCAL_JCC(Assembler::equal, L_success);
1273
1274 // check the supertype display:
// 'temp' first holds the super check offset, then the word found at that
// offset inside sub_klass.
1275 __ movl2ptr(temp, super_check_offset_addr);
1276 Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1277 __ movptr(temp, super_check_addr); // load displayed supertype
1278 __ cmpptr(temp, super_klass_addr); // test the super type
1279 LOCAL_JCC(Assembler::equal, L_success);
1280
1281 // if it was a primary super, we can just fail immediately
// (that is, the super check offset is not the secondary super cache offset)
1282 __ cmpl(super_check_offset_addr, sc_offset);
1283 LOCAL_JCC(Assembler::notEqual, L_failure);
1284
1285 // The repne_scan instruction uses fixed registers, which will get spilled.
1286 // We happen to know this works best when super_klass is in rax.
// 'temp' is reused to hold the super klass for the slow path; the slow path
// receives the same (possibly NULL) label pointers as this function.
1287 Register super_klass = temp;
1288 __ movptr(super_klass, super_klass_addr);
1289 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1290 L_success, L_failure);
1291
1292 __ bind(L_fallthrough);
1293
1294 if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
1295 if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
1296
1297 #undef LOCAL_JCC
1298 }
1299
1300 //
1301 // Generate checkcasting array copy stub
1302 //
1303 // Input:
1304 // 4(rsp) - source array address
1305 // 8(rsp) - destination array address
1306 // 12(rsp) - element count, can be zero
1307 // 16(rsp) - size_t ckoff (super_check_offset)
1308 // 20(rsp) - oop ckval (super_klass)
1309 //
1310 // Output:
1311 // rax, == 0 - success
1312 // rax, == -1^K - failure, where K is partial transfer count
1313 //
// Emits the element-checking oop array copy stub and returns the address of
// its first instruction. Each non-null element is type-checked against the
// super klass passed on the stack before it is stored; null elements are
// stored without a check.
//   name               - stub name recorded by the StubCodeMark
//   entry              - if non-NULL, receives the pc just after from/to/
//                        length have been loaded into rax/rdx/rcx
//   dest_uninitialized - adds IS_DEST_UNINITIALIZED to the barrier decorators
generate_checkcast_copy(const char * name,address * entry,bool dest_uninitialized=false)1314 address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1315 __ align(CodeEntryAlignment);
1316 StubCodeMark mark(this, "StubRoutines", name);
1317 address start = __ pc();
1318
1319 Label L_load_element, L_store_element, L_do_card_marks, L_done;
1320
1321 // register use:
1322 // rax, rdx, rcx -- loop control (end_from, end_to, count)
1323 // rdi, rsi -- element access (oop, klass)
1324 // rbx, -- temp
1325 const Register from = rax; // source array address
1326 const Register to = rdx; // destination array address
1327 const Register length = rcx; // elements count
1328 const Register elem = rdi; // each oop copied
1329 const Register elem_klass = rsi; // each elem._klass (sub_klass)
1330 const Register temp = rbx; // lone remaining temp
1331
1332 __ enter(); // required for proper stackwalking of RuntimeStub frame
1333
1334 __ push(rsi);
1335 __ push(rdi);
1336 __ push(rbx);
1337
// Three registers are pushed here (one more than in the other copy stubs), so
// the argument offsets use 16+N rather than 12+N.
1338 Address from_arg(rsp, 16+ 4); // from
1339 Address to_arg(rsp, 16+ 8); // to
1340 Address length_arg(rsp, 16+12); // elements count
1341 Address ckoff_arg(rsp, 16+16); // super_check_offset
1342 Address ckval_arg(rsp, 16+20); // super_klass
1343
1344 // Load up:
1345 __ movptr(from, from_arg);
1346 __ movptr(to, to_arg);
1347 __ movl2ptr(length, length_arg);
1348
1349 if (entry != NULL) {
1350 *entry = __ pc(); // Entry point from generic arraycopy stub.
1351 BLOCK_COMMENT("Entry:");
1352 }
1353
1354 //---------------------------------------------------------------
1355 // Assembler stub will be used for this call to arraycopy
1356 // if the two arrays are subtypes of Object[] but the
1357 // destination array type is not equal to or a supertype
1358 // of the source type. Each element must be separately
1359 // checked.
1360
1361 // Loop-invariant addresses. They are exclusive end pointers.
1362 Address end_from_addr(from, length, Address::times_ptr, 0);
1363 Address end_to_addr(to, length, Address::times_ptr, 0);
1364
// The end pointers and the running count reuse the registers of from, to and
// length; the original values are overwritten by the lea/negptr below.
1365 Register end_from = from; // re-use
1366 Register end_to = to; // re-use
1367 Register count = length; // re-use
1368
1369 // Loop-variant addresses. They assume post-incremented count < 0.
1370 Address from_element_addr(end_from, count, Address::times_ptr, 0);
1371 Address to_element_addr(end_to, count, Address::times_ptr, 0);
1372 Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1373
1374 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1375 if (dest_uninitialized) {
1376 decorators |= IS_DEST_UNINITIALIZED;
1377 }
1378
1379 BasicType type = T_OBJECT;
1380 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1381 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1382
1383 // Copy from low to high addresses, indexed from the end of each array.
1384 __ lea(end_from, end_from_addr);
1385 __ lea(end_to, end_to_addr);
1386 assert(length == count, ""); // else fix next line:
1387 __ negptr(count); // negate and test the length
1388 __ jccb(Assembler::notZero, L_load_element);
1389
1390 // Empty array: Nothing to do.
1391 __ xorptr(rax, rax); // return 0 on (trivial) success
1392 __ jmp(L_done);
1393
1394 // ======== begin loop ========
1395 // (Loop is rotated; its entry is L_load_element.)
1396 // Loop control:
1397 // for (count = -count; count != 0; count++)
1398 // Base pointers src, dst are biased by count elements (scaled by
// Address::times_ptr), i.e. they point just past the last element.
1399 __ align(OptoLoopAlignment);
1400
1401 __ BIND(L_store_element);
1402 __ movptr(to_element_addr, elem); // store the oop
1403 __ increment(count); // increment the count toward zero
1404 __ jccb(Assembler::zero, L_do_card_marks);
1405
1406 // ======== loop entry is here ========
1407 __ BIND(L_load_element);
1408 __ movptr(elem, from_element_addr); // load the oop
1409 __ testptr(elem, elem);
// A null element needs no type check and is stored directly.
1410 __ jccb(Assembler::zero, L_store_element);
1411
1412 // (Could do a trick here: Remember last successful non-null
1413 // element stored and make a quick oop equality check on it.)
1414
1415 __ movptr(elem_klass, elem_klass_addr); // query the object klass
// Success branches back to L_store_element; the NULL failure label makes a
// failed check fall through to the code below.
1416 generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1417 &L_store_element, NULL);
1418 // (On fall-through, we have failed the element type check.)
1419 // ======== end loop ========
1420
1421 // It was a real error; we must depend on the caller to finish the job.
1422 // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1423 // Emit GC store barriers for the oops we have copied (length_arg + count),
1424 // and report their number to the caller.
1425 assert_different_registers(to, count, rax);
1426 Label L_post_barrier;
// The conditional jump below consumes the flags set by the addl, so the two
// instructions in between must leave the flags untouched.
1427 __ addl(count, length_arg); // transfers = (length - remaining)
1428 __ movl2ptr(rax, count); // save the value
1429 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
1430 __ jccb(Assembler::notZero, L_post_barrier);
1431 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1432
1433 // Come here on success only.
1434 __ BIND(L_do_card_marks);
1435 __ xorptr(rax, rax); // return 0 on success
1436 __ movl2ptr(count, length_arg);
1437
// Reached from both paths with 'count' holding the number of oops stored.
// 'to' was turned into an end pointer above, so the original destination
// address is reloaded from the stack for the barrier epilogue.
1438 __ BIND(L_post_barrier);
1439 __ movptr(to, to_arg); // reload
1440 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1441
1442 // Common exit point (success or failure).
1443 __ BIND(L_done);
1444 __ pop(rbx);
1445 __ pop(rdi);
1446 __ pop(rsi);
1447 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1448 __ leave(); // required for proper stackwalking of RuntimeStub frame
1449 __ ret(0);
1450
1451 return start;
1452 }
1453
1454 //
1455 // Generate 'unsafe' array copy stub
1456 // Though just as safe as the other stubs, it takes an unscaled
1457 // size_t argument instead of an element count.
1458 //
1459 // Input:
1460 // 4(rsp) - source array address
1461 // 8(rsp) - destination array address
1462 // 12(rsp) - byte count, can be zero
1463 //
1464 // Output:
1465 // rax, == 0 - success
1466 // rax, == -1 - need to call System.arraycopy
1467 //
1468 // Examines the alignment of the operands and dispatches
1469 // to a long, int, short, or byte copy loop.
1470 //
// Emits the dispatching stub and returns the address of its first
// instruction. The stub ORs the source address, destination address and byte
// count together and uses the low bits of the result to pick the widest
// element size all three are a multiple of, then jumps to the matching entry.
//   name             - stub name recorded by the StubCodeMark
//   byte_copy_entry  - target when the lowest bit is set
//   short_copy_entry - target when everything is 2-byte aligned
//   int_copy_entry   - target when everything is 4-byte aligned
//   long_copy_entry  - target when everything is 8-byte aligned
generate_unsafe_copy(const char * name,address byte_copy_entry,address short_copy_entry,address int_copy_entry,address long_copy_entry)1471 address generate_unsafe_copy(const char *name,
1472 address byte_copy_entry,
1473 address short_copy_entry,
1474 address int_copy_entry,
1475 address long_copy_entry) {
1476
1477 Label L_long_aligned, L_int_aligned, L_short_aligned;
1478
1479 __ align(CodeEntryAlignment);
1480 StubCodeMark mark(this, "StubRoutines", name);
1481 address start = __ pc();
1482
1483 const Register from = rax; // source array address
1484 const Register to = rdx; // destination array address
1485 const Register count = rcx; // byte count on entry, element count after scaling
1486
1487 __ enter(); // required for proper stackwalking of RuntimeStub frame
1488 __ push(rsi);
1489 __ push(rdi);
1490 Address from_arg(rsp, 12+ 4); // from
1491 Address to_arg(rsp, 12+ 8); // to
1492 Address count_arg(rsp, 12+12); // byte count
1493
1494 // Load up:
1495 __ movptr(from , from_arg);
1496 __ movptr(to , to_arg);
1497 __ movl2ptr(count, count_arg);
1498
1499 // bump this on entry, not on exit:
1500 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1501
// 'bits' has a low bit set wherever any of from, to or count has it set.
1502 const Register bits = rsi;
1503 __ mov(bits, from);
1504 __ orptr(bits, to);
1505 __ orptr(bits, count);
1506
1507 __ testl(bits, BytesPerLong-1);
1508 __ jccb(Assembler::zero, L_long_aligned);
1509
1510 __ testl(bits, BytesPerInt-1);
1511 __ jccb(Assembler::zero, L_int_aligned);
1512
// The byte count needs no scaling, so the byte case jumps straight to its
// target without rewriting the stack argument.
1513 __ testl(bits, BytesPerShort-1);
1514 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1515
// In each case below the scaled count is also written back to the stack
// argument, presumably because the target stubs reread the count from there
// (generate_conjoint_copy does so for its barrier epilogue).
// L_short_aligned is reached by falling through; no jump in this function
// targets it.
1516 __ BIND(L_short_aligned);
1517 __ shrptr(count, LogBytesPerShort); // size => short_count
1518 __ movl(count_arg, count); // update 'count'
1519 __ jump(RuntimeAddress(short_copy_entry));
1520
1521 __ BIND(L_int_aligned);
1522 __ shrptr(count, LogBytesPerInt); // size => int_count
1523 __ movl(count_arg, count); // update 'count'
1524 __ jump(RuntimeAddress(int_copy_entry));
1525
1526 __ BIND(L_long_aligned);
1527 __ shrptr(count, LogBytesPerLong); // size => qword_count
1528 __ movl(count_arg, count); // update 'count'
1529 __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1530 __ pop(rsi);
1531 __ jump(RuntimeAddress(long_copy_entry));
1532
1533 return start;
1534 }
1535
1536
1537 // Perform range checks on the proposed arraycopy.
1538 // Smashes src_pos and dst_pos. (Uses them up for temps.)
// Emits two bounds checks: src_pos + length against the length field of
// 'src', and dst_pos + length against the length field of 'dst'. Control
// branches to L_failed if either sum is above the corresponding array length.
//   src, dst         - registers holding the array oops (not modified)
//   src_pos, dst_pos - position registers; overwritten with the end positions
//   length           - memory operand holding the transfer length
//   L_failed         - branch target taken when a check fails
arraycopy_range_checks(Register src,Register src_pos,Register dst,Register dst_pos,Address & length,Label & L_failed)1539 void arraycopy_range_checks(Register src,
1540 Register src_pos,
1541 Register dst,
1542 Register dst_pos,
1543 Address& length,
1544 Label& L_failed) {
1545 BLOCK_COMMENT("arraycopy_range_checks:");
// The end positions are computed in place, in the position registers.
1546 const Register src_end = src_pos; // source array end position
1547 const Register dst_end = dst_pos; // destination array end position
1548 __ addl(src_end, length); // src_pos + length
1549 __ addl(dst_end, length); // dst_pos + length
1550
// Both comparisons use the unsigned 'above' condition.
1551 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
1552 __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1553 __ jcc(Assembler::above, L_failed);
1554
1555 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1556 __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1557 __ jcc(Assembler::above, L_failed);
1558
1559 BLOCK_COMMENT("arraycopy_range_checks done");
1560 }
1561
1562
1563 //
1564 // Generate generic array copy stubs
1565 //
1566 // Input:
1567 // 4(rsp) - src oop
1568 // 8(rsp) - src_pos
1569 // 12(rsp) - dst oop
1570 // 16(rsp) - dst_pos
1571 // 20(rsp) - element count
1572 //
1573 // Output:
1574 // rax, == 0 - success
1575 // rax, == -1^K - failure, where K is partial transfer count
1576 //
generate_generic_copy(const char * name,address entry_jbyte_arraycopy,address entry_jshort_arraycopy,address entry_jint_arraycopy,address entry_oop_arraycopy,address entry_jlong_arraycopy,address entry_checkcast_arraycopy)1577 address generate_generic_copy(const char *name,
1578 address entry_jbyte_arraycopy,
1579 address entry_jshort_arraycopy,
1580 address entry_jint_arraycopy,
1581 address entry_oop_arraycopy,
1582 address entry_jlong_arraycopy,
1583 address entry_checkcast_arraycopy) {
1584 Label L_failed, L_failed_0, L_objArray;
1585
1586 { int modulus = CodeEntryAlignment;
1587 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
1588 int advance = target - (__ offset() % modulus);
1589 if (advance < 0) advance += modulus;
1590 if (advance > 0) __ nop(advance);
1591 }
1592 StubCodeMark mark(this, "StubRoutines", name);
1593
1594 // Short-hop target to L_failed. Makes for denser prologue code.
1595 __ BIND(L_failed_0);
1596 __ jmp(L_failed);
1597 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1598
1599 __ align(CodeEntryAlignment);
1600 address start = __ pc();
1601
1602 __ enter(); // required for proper stackwalking of RuntimeStub frame
1603 __ push(rsi);
1604 __ push(rdi);
1605
1606 // bump this on entry, not on exit:
1607 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1608
1609 // Input values
1610 Address SRC (rsp, 12+ 4);
1611 Address SRC_POS (rsp, 12+ 8);
1612 Address DST (rsp, 12+12);
1613 Address DST_POS (rsp, 12+16);
1614 Address LENGTH (rsp, 12+20);
1615
1616 //-----------------------------------------------------------------------
1617 // Assembler stub will be used for this call to arraycopy
1618 // if the following conditions are met:
1619 //
1620 // (1) src and dst must not be null.
1621 // (2) src_pos must not be negative.
1622 // (3) dst_pos must not be negative.
1623 // (4) length must not be negative.
1624 // (5) src klass and dst klass should be the same and not NULL.
1625 // (6) src and dst should be arrays.
1626 // (7) src_pos + length must not exceed length of src.
1627 // (8) dst_pos + length must not exceed length of dst.
1628 //
1629
1630 const Register src = rax; // source array oop
1631 const Register src_pos = rsi;
1632 const Register dst = rdx; // destination array oop
1633 const Register dst_pos = rdi;
1634 const Register length = rcx; // transfer count
1635
1636 // if (src == NULL) return -1;
1637 __ movptr(src, SRC); // src oop
1638 __ testptr(src, src);
1639 __ jccb(Assembler::zero, L_failed_0);
1640
1641 // if (src_pos < 0) return -1;
1642 __ movl2ptr(src_pos, SRC_POS); // src_pos
1643 __ testl(src_pos, src_pos);
1644 __ jccb(Assembler::negative, L_failed_0);
1645
1646 // if (dst == NULL) return -1;
1647 __ movptr(dst, DST); // dst oop
1648 __ testptr(dst, dst);
1649 __ jccb(Assembler::zero, L_failed_0);
1650
1651 // if (dst_pos < 0) return -1;
1652 __ movl2ptr(dst_pos, DST_POS); // dst_pos
1653 __ testl(dst_pos, dst_pos);
1654 __ jccb(Assembler::negative, L_failed_0);
1655
1656 // if (length < 0) return -1;
1657 __ movl2ptr(length, LENGTH); // length
1658 __ testl(length, length);
1659 __ jccb(Assembler::negative, L_failed_0);
1660
1661 // if (src->klass() == NULL) return -1;
1662 Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1663 Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1664 const Register rcx_src_klass = rcx; // array klass
1665 __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1666
1667 #ifdef ASSERT
1668 // assert(src->klass() != NULL);
1669 BLOCK_COMMENT("assert klasses not null");
1670 { Label L1, L2;
1671 __ testptr(rcx_src_klass, rcx_src_klass);
1672 __ jccb(Assembler::notZero, L2); // it is broken if klass is NULL
1673 __ bind(L1);
1674 __ stop("broken null klass");
1675 __ bind(L2);
1676 __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
1677 __ jccb(Assembler::equal, L1); // this would be broken also
1678 BLOCK_COMMENT("assert done");
1679 }
1680 #endif //ASSERT
1681
1682 // Load layout helper (32-bits)
1683 //
1684 // |array_tag| | header_size | element_type | |log2_element_size|
1685 // 32 30 24 16 8 2 0
1686 //
1687 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1688 //
1689
1690 int lh_offset = in_bytes(Klass::layout_helper_offset());
1691 Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1692
1693 // Handle objArrays completely differently...
1694 jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1695 __ cmpl(src_klass_lh_addr, objArray_lh);
1696 __ jcc(Assembler::equal, L_objArray);
1697
1698 // if (src->klass() != dst->klass()) return -1;
1699 __ cmpptr(rcx_src_klass, dst_klass_addr);
1700 __ jccb(Assembler::notEqual, L_failed_0);
1701
1702 const Register rcx_lh = rcx; // layout helper
1703 assert(rcx_lh == rcx_src_klass, "known alias");
1704 __ movl(rcx_lh, src_klass_lh_addr);
1705
1706 // if (!src->is_Array()) return -1;
1707 __ cmpl(rcx_lh, Klass::_lh_neutral_value);
1708 __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1709
1710 // At this point, it is known to be a typeArray (array_tag 0x3).
1711 #ifdef ASSERT
1712 { Label L;
1713 __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1714 __ jcc(Assembler::greaterEqual, L); // signed cmp
1715 __ stop("must be a primitive array");
1716 __ bind(L);
1717 }
1718 #endif
1719
1720 assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
1721 arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1722
1723 // TypeArrayKlass
1724 //
1725 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1726 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1727 //
1728 const Register rsi_offset = rsi; // array offset
1729 const Register src_array = src; // src array offset
1730 const Register dst_array = dst; // dst array offset
1731 const Register rdi_elsize = rdi; // log2 element size
1732
1733 __ mov(rsi_offset, rcx_lh);
1734 __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
1735 __ andptr(rsi_offset, Klass::_lh_header_size_mask); // array_offset
1736 __ addptr(src_array, rsi_offset); // src array offset
1737 __ addptr(dst_array, rsi_offset); // dst array offset
1738 __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
1739
1740 // next registers should be set before the jump to corresponding stub
1741 const Register from = src; // source array address
1742 const Register to = dst; // destination array address
1743 const Register count = rcx; // elements count
1744 // some of them should be duplicated on stack
1745 #define FROM Address(rsp, 12+ 4)
1746 #define TO Address(rsp, 12+ 8) // Not used now
1747 #define COUNT Address(rsp, 12+12) // Only for oop arraycopy
1748
1749 BLOCK_COMMENT("scale indexes to element size");
1750 __ movl2ptr(rsi, SRC_POS); // src_pos
1751 __ shlptr(rsi); // src_pos << rcx (log2 elsize)
1752 assert(src_array == from, "");
1753 __ addptr(from, rsi); // from = src_array + SRC_POS << log2 elsize
1754 __ movl2ptr(rdi, DST_POS); // dst_pos
1755 __ shlptr(rdi); // dst_pos << rcx (log2 elsize)
1756 assert(dst_array == to, "");
1757 __ addptr(to, rdi); // to = dst_array + DST_POS << log2 elsize
1758 __ movptr(FROM, from); // src_addr
1759 __ mov(rdi_elsize, rcx_lh); // log2 elsize
1760 __ movl2ptr(count, LENGTH); // elements count
1761
1762 BLOCK_COMMENT("choose copy loop based on element size");
1763 __ cmpl(rdi_elsize, 0);
1764
1765 __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
1766 __ cmpl(rdi_elsize, LogBytesPerShort);
1767 __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
1768 __ cmpl(rdi_elsize, LogBytesPerInt);
1769 __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
1770 #ifdef ASSERT
1771 __ cmpl(rdi_elsize, LogBytesPerLong);
1772 __ jccb(Assembler::notEqual, L_failed);
1773 #endif
1774 __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1775 __ pop(rsi);
1776 __ jump(RuntimeAddress(entry_jlong_arraycopy));
1777
1778 __ BIND(L_failed);
1779 __ xorptr(rax, rax);
1780 __ notptr(rax); // return -1
1781 __ pop(rdi);
1782 __ pop(rsi);
1783 __ leave(); // required for proper stackwalking of RuntimeStub frame
1784 __ ret(0);
1785
1786 // ObjArrayKlass
1787 __ BIND(L_objArray);
1788 // live at this point: rcx_src_klass, src[_pos], dst[_pos]
1789
1790 Label L_plain_copy, L_checkcast_copy;
1791 // test array classes for subtyping
1792 __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
1793 __ jccb(Assembler::notEqual, L_checkcast_copy);
1794
1795 // Identically typed arrays can be copied without element-wise checks.
1796 assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
1797 arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1798
1799 __ BIND(L_plain_copy);
1800 __ movl2ptr(count, LENGTH); // elements count
1801 __ movl2ptr(src_pos, SRC_POS); // reload src_pos
1802 __ lea(from, Address(src, src_pos, Address::times_ptr,
1803 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
1804 __ movl2ptr(dst_pos, DST_POS); // reload dst_pos
1805 __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1806 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
1807 __ movptr(FROM, from); // src_addr
1808 __ movptr(TO, to); // dst_addr
1809 __ movl(COUNT, count); // count
1810 __ jump(RuntimeAddress(entry_oop_arraycopy));
1811
1812 __ BIND(L_checkcast_copy);
1813 // live at this point: rcx_src_klass, dst[_pos], src[_pos]
1814 {
1815 // Handy offsets:
1816 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1817 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1818
1819 Register rsi_dst_klass = rsi;
1820 Register rdi_temp = rdi;
1821 assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
1822 assert(rdi_temp == dst_pos, "expected alias w/ dst_pos");
1823 Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
1824
1825 // Before looking at dst.length, make sure dst is also an objArray.
1826 __ movptr(rsi_dst_klass, dst_klass_addr);
1827 __ cmpl(dst_klass_lh_addr, objArray_lh);
1828 __ jccb(Assembler::notEqual, L_failed);
1829
1830 // It is safe to examine both src.length and dst.length.
1831 __ movl2ptr(src_pos, SRC_POS); // reload rsi
1832 arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
1833 // (Now src_pos and dst_pos are killed, but not src and dst.)
1834
1835 // We'll need this temp (don't forget to pop it after the type check).
1836 __ push(rbx);
1837 Register rbx_src_klass = rbx;
1838
1839 __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
1840 __ movptr(rsi_dst_klass, dst_klass_addr);
1841 Address super_check_offset_addr(rsi_dst_klass, sco_offset);
1842 Label L_fail_array_check;
1843 generate_type_check(rbx_src_klass,
1844 super_check_offset_addr, dst_klass_addr,
1845 rdi_temp, NULL, &L_fail_array_check);
1846 // (On fall-through, we have passed the array type check.)
1847 __ pop(rbx);
1848 __ jmp(L_plain_copy);
1849
1850 __ BIND(L_fail_array_check);
1851 // Reshuffle arguments so we can call checkcast_arraycopy:
1852
1853 // match initial saves for checkcast_arraycopy
1854 // push(rsi); // already done; see above
1855 // push(rdi); // already done; see above
1856 // push(rbx); // already done; see above
1857
1858 // Marshal outgoing arguments now, freeing registers.
1859 Address from_arg(rsp, 16+ 4); // from
1860 Address to_arg(rsp, 16+ 8); // to
1861 Address length_arg(rsp, 16+12); // elements count
1862 Address ckoff_arg(rsp, 16+16); // super_check_offset
1863 Address ckval_arg(rsp, 16+20); // super_klass
1864
1865 Address SRC_POS_arg(rsp, 16+ 8);
1866 Address DST_POS_arg(rsp, 16+16);
1867 Address LENGTH_arg(rsp, 16+20);
1868 // push rbx, changed the incoming offsets (why not just use rbp,??)
1869 // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
1870
1871 __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
1872 __ movl2ptr(length, LENGTH_arg); // reload elements count
1873 __ movl2ptr(src_pos, SRC_POS_arg); // reload src_pos
1874 __ movl2ptr(dst_pos, DST_POS_arg); // reload dst_pos
1875
1876 __ movptr(ckval_arg, rbx); // destination element type
1877 __ movl(rbx, Address(rbx, sco_offset));
1878 __ movl(ckoff_arg, rbx); // corresponding class check offset
1879
1880 __ movl(length_arg, length); // outgoing length argument
1881
1882 __ lea(from, Address(src, src_pos, Address::times_ptr,
1883 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1884 __ movptr(from_arg, from);
1885
1886 __ lea(to, Address(dst, dst_pos, Address::times_ptr,
1887 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
1888 __ movptr(to_arg, to);
1889 __ jump(RuntimeAddress(entry_checkcast_arraycopy));
1890 }
1891
1892 return start;
1893 }
1894
generate_arraycopy_stubs()1895 void generate_arraycopy_stubs() {
1896 address entry;
1897 address entry_jbyte_arraycopy;
1898 address entry_jshort_arraycopy;
1899 address entry_jint_arraycopy;
1900 address entry_oop_arraycopy;
1901 address entry_jlong_arraycopy;
1902 address entry_checkcast_arraycopy;
1903
1904 StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
1905 generate_disjoint_copy(T_BYTE, true, Address::times_1, &entry,
1906 "arrayof_jbyte_disjoint_arraycopy");
1907 StubRoutines::_arrayof_jbyte_arraycopy =
1908 generate_conjoint_copy(T_BYTE, true, Address::times_1, entry,
1909 NULL, "arrayof_jbyte_arraycopy");
1910 StubRoutines::_jbyte_disjoint_arraycopy =
1911 generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
1912 "jbyte_disjoint_arraycopy");
1913 StubRoutines::_jbyte_arraycopy =
1914 generate_conjoint_copy(T_BYTE, false, Address::times_1, entry,
1915 &entry_jbyte_arraycopy, "jbyte_arraycopy");
1916
1917 StubRoutines::_arrayof_jshort_disjoint_arraycopy =
1918 generate_disjoint_copy(T_SHORT, true, Address::times_2, &entry,
1919 "arrayof_jshort_disjoint_arraycopy");
1920 StubRoutines::_arrayof_jshort_arraycopy =
1921 generate_conjoint_copy(T_SHORT, true, Address::times_2, entry,
1922 NULL, "arrayof_jshort_arraycopy");
1923 StubRoutines::_jshort_disjoint_arraycopy =
1924 generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
1925 "jshort_disjoint_arraycopy");
1926 StubRoutines::_jshort_arraycopy =
1927 generate_conjoint_copy(T_SHORT, false, Address::times_2, entry,
1928 &entry_jshort_arraycopy, "jshort_arraycopy");
1929
1930 // Next arrays are always aligned on 4 bytes at least.
1931 StubRoutines::_jint_disjoint_arraycopy =
1932 generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
1933 "jint_disjoint_arraycopy");
1934 StubRoutines::_jint_arraycopy =
1935 generate_conjoint_copy(T_INT, true, Address::times_4, entry,
1936 &entry_jint_arraycopy, "jint_arraycopy");
1937
1938 StubRoutines::_oop_disjoint_arraycopy =
1939 generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
1940 "oop_disjoint_arraycopy");
1941 StubRoutines::_oop_arraycopy =
1942 generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
1943 &entry_oop_arraycopy, "oop_arraycopy");
1944
1945 StubRoutines::_oop_disjoint_arraycopy_uninit =
1946 generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
1947 "oop_disjoint_arraycopy_uninit",
1948 /*dest_uninitialized*/true);
1949 StubRoutines::_oop_arraycopy_uninit =
1950 generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
1951 NULL, "oop_arraycopy_uninit",
1952 /*dest_uninitialized*/true);
1953
1954 StubRoutines::_jlong_disjoint_arraycopy =
1955 generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
1956 StubRoutines::_jlong_arraycopy =
1957 generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
1958 "jlong_arraycopy");
1959
1960 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
1961 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
1962 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
1963 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
1964 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
1965 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
1966
1967 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
1968 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
1969 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
1970 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
1971
1972 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
1973 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
1974 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
1975 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
1976
1977 StubRoutines::_checkcast_arraycopy =
1978 generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
1979 StubRoutines::_checkcast_arraycopy_uninit =
1980 generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
1981
1982 StubRoutines::_unsafe_arraycopy =
1983 generate_unsafe_copy("unsafe_arraycopy",
1984 entry_jbyte_arraycopy,
1985 entry_jshort_arraycopy,
1986 entry_jint_arraycopy,
1987 entry_jlong_arraycopy);
1988
1989 StubRoutines::_generic_arraycopy =
1990 generate_generic_copy("generic_arraycopy",
1991 entry_jbyte_arraycopy,
1992 entry_jshort_arraycopy,
1993 entry_jint_arraycopy,
1994 entry_oop_arraycopy,
1995 entry_jlong_arraycopy,
1996 entry_checkcast_arraycopy);
1997 }
1998
1999 // AES intrinsic stubs
// Number of bytes the AES stubs advance per processed block.
enum { AESBlockSize = 16 };
2001
generate_key_shuffle_mask()2002 address generate_key_shuffle_mask() {
2003 __ align(16);
2004 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2005 address start = __ pc();
2006 __ emit_data(0x00010203, relocInfo::none, 0 );
2007 __ emit_data(0x04050607, relocInfo::none, 0 );
2008 __ emit_data(0x08090a0b, relocInfo::none, 0 );
2009 __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
2010 return start;
2011 }
2012
generate_counter_shuffle_mask()2013 address generate_counter_shuffle_mask() {
2014 __ align(16);
2015 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2016 address start = __ pc();
2017 __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2018 __ emit_data(0x08090a0b, relocInfo::none, 0);
2019 __ emit_data(0x04050607, relocInfo::none, 0);
2020 __ emit_data(0x00010203, relocInfo::none, 0);
2021 return start;
2022 }
2023
2024 // Utility routine for loading a 128-bit key word in little endian format
2025 // can optionally specify that the shuffle mask is already in an xmmregister
load_key(XMMRegister xmmdst,Register key,int offset,XMMRegister xmm_shuf_mask=NULL)2026 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2027 __ movdqu(xmmdst, Address(key, offset));
2028 if (xmm_shuf_mask != NULL) {
2029 __ pshufb(xmmdst, xmm_shuf_mask);
2030 } else {
2031 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2032 }
2033 }
2034
2035 // aesenc using specified key+offset
2036 // can optionally specify that the shuffle mask is already in an xmmregister
aes_enc_key(XMMRegister xmmdst,XMMRegister xmmtmp,Register key,int offset,XMMRegister xmm_shuf_mask=NULL)2037 void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2038 load_key(xmmtmp, key, offset, xmm_shuf_mask);
2039 __ aesenc(xmmdst, xmmtmp);
2040 }
2041
2042 // aesdec using specified key+offset
2043 // can optionally specify that the shuffle mask is already in an xmmregister
aes_dec_key(XMMRegister xmmdst,XMMRegister xmmtmp,Register key,int offset,XMMRegister xmm_shuf_mask=NULL)2044 void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2045 load_key(xmmtmp, key, offset, xmm_shuf_mask);
2046 __ aesdec(xmmdst, xmmtmp);
2047 }
2048
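// Utility routine for incrementing the 128-bit counter (IV in CTR mode).
// The XMM register is handled as four 32-bit dwords D3, D2, D1, D0: the
// increment is added to D0 and a carry propagates upward through D1, D2, D3.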
inc_counter(Register reg,XMMRegister xmmdst,int inc_delta,Label & next_block)2051 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2052 __ pextrd(reg, xmmdst, 0x0);
2053 __ addl(reg, inc_delta);
2054 __ pinsrd(xmmdst, reg, 0x0);
2055 __ jcc(Assembler::carryClear, next_block); // jump if no carry
2056
2057 __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
2058 __ addl(reg, 0x01);
2059 __ pinsrd(xmmdst, reg, 0x01);
2060 __ jcc(Assembler::carryClear, next_block); // jump if no carry
2061
2062 __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
2063 __ addl(reg, 0x01);
2064 __ pinsrd(xmmdst, reg, 0x02);
2065 __ jcc(Assembler::carryClear, next_block); // jump if no carry
2066
2067 __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
2068 __ addl(reg, 0x01);
2069 __ pinsrd(xmmdst, reg, 0x03);
2070
2071 __ BIND(next_block); // next instruction
2072 }
2073
2074
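// Arguments:
//
// Inputs (this is the 32-bit stub: the c_rargN names denote argument
// positions; the values are read from the stack at rbp+8, rbp+12 and rbp+16
// after enter()):
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//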
generate_aescrypt_encryptBlock()2082 address generate_aescrypt_encryptBlock() {
2083 assert(UseAES, "need AES instructions and misaligned SSE support");
2084 __ align(CodeEntryAlignment);
2085 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2086 Label L_doLast;
2087 address start = __ pc();
2088
2089 const Register from = rdx; // source array address
2090 const Register to = rdx; // destination array address
2091 const Register key = rcx; // key array address
2092 const Register keylen = rax;
2093 const Address from_param(rbp, 8+0);
2094 const Address to_param (rbp, 8+4);
2095 const Address key_param (rbp, 8+8);
2096
2097 const XMMRegister xmm_result = xmm0;
2098 const XMMRegister xmm_key_shuf_mask = xmm1;
2099 const XMMRegister xmm_temp1 = xmm2;
2100 const XMMRegister xmm_temp2 = xmm3;
2101 const XMMRegister xmm_temp3 = xmm4;
2102 const XMMRegister xmm_temp4 = xmm5;
2103
2104 __ enter(); // required for proper stackwalking of RuntimeStub frame
2105
2106 __ movptr(from, from_param);
2107 __ movptr(key, key_param);
2108
2109 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2110 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2111
2112 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2113 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
2114 __ movptr(to, to_param);
2115
2116 // For encryption, the java expanded key ordering is just what we need
2117
2118 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2119 __ pxor(xmm_result, xmm_temp1);
2120
2121 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2122 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2123 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2124 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2125
2126 __ aesenc(xmm_result, xmm_temp1);
2127 __ aesenc(xmm_result, xmm_temp2);
2128 __ aesenc(xmm_result, xmm_temp3);
2129 __ aesenc(xmm_result, xmm_temp4);
2130
2131 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2132 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2133 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2134 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2135
2136 __ aesenc(xmm_result, xmm_temp1);
2137 __ aesenc(xmm_result, xmm_temp2);
2138 __ aesenc(xmm_result, xmm_temp3);
2139 __ aesenc(xmm_result, xmm_temp4);
2140
2141 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2142 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2143
2144 __ cmpl(keylen, 44);
2145 __ jccb(Assembler::equal, L_doLast);
2146
2147 __ aesenc(xmm_result, xmm_temp1);
2148 __ aesenc(xmm_result, xmm_temp2);
2149
2150 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2151 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2152
2153 __ cmpl(keylen, 52);
2154 __ jccb(Assembler::equal, L_doLast);
2155
2156 __ aesenc(xmm_result, xmm_temp1);
2157 __ aesenc(xmm_result, xmm_temp2);
2158
2159 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2160 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2161
2162 __ BIND(L_doLast);
2163 __ aesenc(xmm_result, xmm_temp1);
2164 __ aesenclast(xmm_result, xmm_temp2);
2165 __ movdqu(Address(to, 0), xmm_result); // store the result
2166 __ xorptr(rax, rax); // return 0
2167 __ leave(); // required for proper stackwalking of RuntimeStub frame
2168 __ ret(0);
2169
2170 return start;
2171 }
2172
2173
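// Arguments:
//
// Inputs (this is the 32-bit stub: the c_rargN names denote argument
// positions; the values are read from the stack at rbp+8, rbp+12 and rbp+16
// after enter()):
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//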
generate_aescrypt_decryptBlock()2181 address generate_aescrypt_decryptBlock() {
2182 assert(UseAES, "need AES instructions and misaligned SSE support");
2183 __ align(CodeEntryAlignment);
2184 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2185 Label L_doLast;
2186 address start = __ pc();
2187
2188 const Register from = rdx; // source array address
2189 const Register to = rdx; // destination array address
2190 const Register key = rcx; // key array address
2191 const Register keylen = rax;
2192 const Address from_param(rbp, 8+0);
2193 const Address to_param (rbp, 8+4);
2194 const Address key_param (rbp, 8+8);
2195
2196 const XMMRegister xmm_result = xmm0;
2197 const XMMRegister xmm_key_shuf_mask = xmm1;
2198 const XMMRegister xmm_temp1 = xmm2;
2199 const XMMRegister xmm_temp2 = xmm3;
2200 const XMMRegister xmm_temp3 = xmm4;
2201 const XMMRegister xmm_temp4 = xmm5;
2202
2203 __ enter(); // required for proper stackwalking of RuntimeStub frame
2204
2205 __ movptr(from, from_param);
2206 __ movptr(key, key_param);
2207
2208 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2209 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2210
2211 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2212 __ movdqu(xmm_result, Address(from, 0));
2213 __ movptr(to, to_param);
2214
2215 // for decryption java expanded key ordering is rotated one position from what we want
2216 // so we start from 0x10 here and hit 0x00 last
2217 // we don't know if the key is aligned, hence not using load-execute form
2218 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2219 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2220 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2221 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2222
2223 __ pxor (xmm_result, xmm_temp1);
2224 __ aesdec(xmm_result, xmm_temp2);
2225 __ aesdec(xmm_result, xmm_temp3);
2226 __ aesdec(xmm_result, xmm_temp4);
2227
2228 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2229 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2230 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2231 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2232
2233 __ aesdec(xmm_result, xmm_temp1);
2234 __ aesdec(xmm_result, xmm_temp2);
2235 __ aesdec(xmm_result, xmm_temp3);
2236 __ aesdec(xmm_result, xmm_temp4);
2237
2238 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2239 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2240 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2241
2242 __ cmpl(keylen, 44);
2243 __ jccb(Assembler::equal, L_doLast);
2244
2245 __ aesdec(xmm_result, xmm_temp1);
2246 __ aesdec(xmm_result, xmm_temp2);
2247
2248 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2249 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2250
2251 __ cmpl(keylen, 52);
2252 __ jccb(Assembler::equal, L_doLast);
2253
2254 __ aesdec(xmm_result, xmm_temp1);
2255 __ aesdec(xmm_result, xmm_temp2);
2256
2257 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2258 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2259
2260 __ BIND(L_doLast);
2261 __ aesdec(xmm_result, xmm_temp1);
2262 __ aesdec(xmm_result, xmm_temp2);
2263
2264 // for decryption the aesdeclast operation is always on key+0x00
2265 __ aesdeclast(xmm_result, xmm_temp3);
2266 __ movdqu(Address(to, 0), xmm_result); // store the result
2267 __ xorptr(rax, rax); // return 0
2268 __ leave(); // required for proper stackwalking of RuntimeStub frame
2269 __ ret(0);
2270
2271 return start;
2272 }
2273
handleSOERegisters(bool saving)2274 void handleSOERegisters(bool saving) {
2275 const int saveFrameSizeInBytes = 4 * wordSize;
2276 const Address saved_rbx (rbp, -3 * wordSize);
2277 const Address saved_rsi (rbp, -2 * wordSize);
2278 const Address saved_rdi (rbp, -1 * wordSize);
2279
2280 if (saving) {
2281 __ subptr(rsp, saveFrameSizeInBytes);
2282 __ movptr(saved_rsi, rsi);
2283 __ movptr(saved_rdi, rdi);
2284 __ movptr(saved_rbx, rbx);
2285 } else {
2286 // restoring
2287 __ movptr(rsi, saved_rsi);
2288 __ movptr(rdi, saved_rdi);
2289 __ movptr(rbx, saved_rbx);
2290 }
2291 }
2292
// Arguments:
//
// Inputs (this is the 32-bit stub: the c_rargN names denote argument
// positions; the values are read from the stack at rbp+8 through rbp+24
// after enter()):
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//   c_rarg3   - r vector byte array address
//   c_rarg4   - input length
//
// Output:
//   rax       - input length
//
generate_cipherBlockChaining_encryptAESCrypt()2305 address generate_cipherBlockChaining_encryptAESCrypt() {
2306 assert(UseAES, "need AES instructions and misaligned SSE support");
2307 __ align(CodeEntryAlignment);
2308 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2309 address start = __ pc();
2310
2311 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2312 const Register from = rsi; // source array address
2313 const Register to = rdx; // destination array address
2314 const Register key = rcx; // key array address
2315 const Register rvec = rdi; // r byte array initialized from initvector array address
2316 // and left with the results of the last encryption block
2317 const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
2318 const Register pos = rax;
2319
2320 // xmm register assignments for the loops below
2321 const XMMRegister xmm_result = xmm0;
2322 const XMMRegister xmm_temp = xmm1;
2323 // first 6 keys preloaded into xmm2-xmm7
2324 const int XMM_REG_NUM_KEY_FIRST = 2;
2325 const int XMM_REG_NUM_KEY_LAST = 7;
2326 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2327
2328 __ enter(); // required for proper stackwalking of RuntimeStub frame
2329 handleSOERegisters(true /*saving*/);
2330
2331 // load registers from incoming parameters
2332 const Address from_param(rbp, 8+0);
2333 const Address to_param (rbp, 8+4);
2334 const Address key_param (rbp, 8+8);
2335 const Address rvec_param (rbp, 8+12);
2336 const Address len_param (rbp, 8+16);
2337 __ movptr(from , from_param);
2338 __ movptr(to , to_param);
2339 __ movptr(key , key_param);
2340 __ movptr(rvec , rvec_param);
2341 __ movptr(len_reg , len_param);
2342
2343 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
2344 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2345 // load up xmm regs 2 thru 7 with keys 0-5
2346 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2347 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2348 offset += 0x10;
2349 }
2350
2351 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
2352
2353 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2354 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2355 __ cmpl(rax, 44);
2356 __ jcc(Assembler::notEqual, L_key_192_256);
2357
2358 // 128 bit code follows here
2359 __ movl(pos, 0);
2360 __ align(OptoLoopAlignment);
2361 __ BIND(L_loopTop_128);
2362 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2363 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2364
2365 __ pxor (xmm_result, xmm_key0); // do the aes rounds
2366 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2367 __ aesenc(xmm_result, as_XMMRegister(rnum));
2368 }
2369 for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2370 aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2371 }
2372 load_key(xmm_temp, key, 0xa0);
2373 __ aesenclast(xmm_result, xmm_temp);
2374
2375 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2376 // no need to store r to memory until we exit
2377 __ addptr(pos, AESBlockSize);
2378 __ subptr(len_reg, AESBlockSize);
2379 __ jcc(Assembler::notEqual, L_loopTop_128);
2380
2381 __ BIND(L_exit);
2382 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
2383
2384 handleSOERegisters(false /*restoring*/);
2385 __ movptr(rax, len_param); // return length
2386 __ leave(); // required for proper stackwalking of RuntimeStub frame
2387 __ ret(0);
2388
2389 __ BIND(L_key_192_256);
2390 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2391 __ cmpl(rax, 52);
2392 __ jcc(Assembler::notEqual, L_key_256);
2393
2394 // 192-bit code follows here (could be changed to use more xmm registers)
2395 __ movl(pos, 0);
2396 __ align(OptoLoopAlignment);
2397 __ BIND(L_loopTop_192);
2398 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2399 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2400
2401 __ pxor (xmm_result, xmm_key0); // do the aes rounds
2402 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2403 __ aesenc(xmm_result, as_XMMRegister(rnum));
2404 }
2405 for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2406 aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2407 }
2408 load_key(xmm_temp, key, 0xc0);
2409 __ aesenclast(xmm_result, xmm_temp);
2410
2411 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2412 // no need to store r to memory until we exit
2413 __ addptr(pos, AESBlockSize);
2414 __ subptr(len_reg, AESBlockSize);
2415 __ jcc(Assembler::notEqual, L_loopTop_192);
2416 __ jmp(L_exit);
2417
2418 __ BIND(L_key_256);
2419 // 256-bit code follows here (could be changed to use more xmm registers)
2420 __ movl(pos, 0);
2421 __ align(OptoLoopAlignment);
2422 __ BIND(L_loopTop_256);
2423 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2424 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2425
2426 __ pxor (xmm_result, xmm_key0); // do the aes rounds
2427 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2428 __ aesenc(xmm_result, as_XMMRegister(rnum));
2429 }
2430 for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2431 aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2432 }
2433 load_key(xmm_temp, key, 0xe0);
2434 __ aesenclast(xmm_result, xmm_temp);
2435
2436 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2437 // no need to store r to memory until we exit
2438 __ addptr(pos, AESBlockSize);
2439 __ subptr(len_reg, AESBlockSize);
2440 __ jcc(Assembler::notEqual, L_loopTop_256);
2441 __ jmp(L_exit);
2442
2443 return start;
2444 }
2445
2446
2447 // CBC AES Decryption.
2448 // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2449 //
2450 // Arguments:
2451 //
2452 // Inputs:
2453 // c_rarg0 - source byte array address
2454 // c_rarg1 - destination byte array address
2455 // c_rarg2 - K (key) in little endian int array
2456 // c_rarg3 - r vector byte array address
2457 // c_rarg4 - input length
2458 //
2459 // Output:
2460 // rax - input length
2461 //
2462
generate_cipherBlockChaining_decryptAESCrypt_Parallel()2463 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
2464 assert(UseAES, "need AES instructions and misaligned SSE support");
2465 __ align(CodeEntryAlignment);
2466 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2467 address start = __ pc();
2468
2469 const Register from = rsi; // source array address
2470 const Register to = rdx; // destination array address
2471 const Register key = rcx; // key array address
2472 const Register rvec = rdi; // r byte array initialized from initvector array address
2473 // and left with the results of the last encryption block
2474 const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
2475 const Register pos = rax;
2476
2477 const int PARALLEL_FACTOR = 4;
2478 const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
2479
2480 Label L_exit;
2481 Label L_singleBlock_loopTop[3]; //128, 192, 256
2482 Label L_multiBlock_loopTop[3]; //128, 192, 256
2483
2484 const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
2485 const XMMRegister xmm_key_shuf_mask = xmm1;
2486
2487 const XMMRegister xmm_key_tmp0 = xmm2;
2488 const XMMRegister xmm_key_tmp1 = xmm3;
2489
2490 // registers holding the six results in the parallelized loop
2491 const XMMRegister xmm_result0 = xmm4;
2492 const XMMRegister xmm_result1 = xmm5;
2493 const XMMRegister xmm_result2 = xmm6;
2494 const XMMRegister xmm_result3 = xmm7;
2495
2496 __ enter(); // required for proper stackwalking of RuntimeStub frame
2497 handleSOERegisters(true /*saving*/);
2498
2499 // load registers from incoming parameters
2500 const Address from_param(rbp, 8+0);
2501 const Address to_param (rbp, 8+4);
2502 const Address key_param (rbp, 8+8);
2503 const Address rvec_param (rbp, 8+12);
2504 const Address len_param (rbp, 8+16);
2505
2506 __ movptr(from , from_param);
2507 __ movptr(to , to_param);
2508 __ movptr(key , key_param);
2509 __ movptr(rvec , rvec_param);
2510 __ movptr(len_reg , len_param);
2511
2512 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2513 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
2514
2515 __ xorptr(pos, pos);
2516
2517 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2518 // rvec is reused
2519 __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2520 __ cmpl(rvec, 52);
2521 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
2522 __ cmpl(rvec, 60);
2523 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
2524
2525 #define DoFour(opc, src_reg) \
2526 __ opc(xmm_result0, src_reg); \
2527 __ opc(xmm_result1, src_reg); \
2528 __ opc(xmm_result2, src_reg); \
2529 __ opc(xmm_result3, src_reg); \
2530
2531 for (int k = 0; k < 3; ++k) {
2532 __ align(OptoLoopAlignment);
2533 __ BIND(L_multiBlock_loopTop[k]);
2534 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
2535 __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
2536
2537 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
2538 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2539 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2540 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2541
2542 // the java expanded key ordering is rotated one position from what we want
2543 // so we start from 0x10 here and hit 0x00 last
2544 load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2545 DoFour(pxor, xmm_key_tmp0); //xor with first key
2546 // do the aes dec rounds
2547 for (int rnum = 1; rnum <= ROUNDS[k];) {
2548 //load two keys at a time
2549 //k1->0x20, ..., k9->0xa0, k10->0x00
2550 load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2551 load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
2552 DoFour(aesdec, xmm_key_tmp1);
2553 rnum++;
2554 if (rnum != ROUNDS[k]) {
2555 DoFour(aesdec, xmm_key_tmp0);
2556 }
2557 else {
2558 DoFour(aesdeclast, xmm_key_tmp0);
2559 }
2560 rnum++;
2561 }
2562
2563 // for each result, xor with the r vector of previous cipher block
2564 __ pxor(xmm_result0, xmm_prev_block_cipher);
2565 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2566 __ pxor(xmm_result1, xmm_prev_block_cipher);
2567 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2568 __ pxor(xmm_result2, xmm_prev_block_cipher);
2569 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2570 __ pxor(xmm_result3, xmm_prev_block_cipher);
2571 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
2572
2573 // store 4 results into the next 64 bytes of output
2574 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2575 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2576 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2577 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2578
2579 __ addptr(pos, 4 * AESBlockSize);
2580 __ subptr(len_reg, 4 * AESBlockSize);
2581 __ jmp(L_multiBlock_loopTop[k]);
2582
2583 //singleBlock starts here
2584 __ align(OptoLoopAlignment);
2585 __ BIND(L_singleBlock_loopTop[k]);
2586 __ cmpptr(len_reg, 0); // any blocks left?
2587 __ jcc(Assembler::equal, L_exit);
2588 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2589 __ movdqa(xmm_result1, xmm_result0);
2590
2591 load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2592 __ pxor(xmm_result0, xmm_key_tmp0);
2593 // do the aes dec rounds
2594 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
2595 // the java expanded key ordering is rotated one position from what we want
2596 load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2597 __ aesdec(xmm_result0, xmm_key_tmp0);
2598 }
2599 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
2600 __ aesdeclast(xmm_result0, xmm_key_tmp0);
2601 __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
2602 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
2603 // no need to store r to memory until we exit
2604 __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
2605
2606 __ addptr(pos, AESBlockSize);
2607 __ subptr(len_reg, AESBlockSize);
2608 __ jmp(L_singleBlock_loopTop[k]);
2609 }//for 128/192/256
2610
2611 __ BIND(L_exit);
2612 __ movptr(rvec, rvec_param); // restore this since reused earlier
2613 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
2614 handleSOERegisters(false /*restoring*/);
2615 __ movptr(rax, len_param); // return length
2616 __ leave(); // required for proper stackwalking of RuntimeStub frame
2617 __ ret(0);
2618
2619 return start;
2620 }
2621
// CTR AES crypt.
// In 32-bit stub, parallelize 4 blocks at a time
// Arguments:
//
// Inputs (passed on the stack; read relative to rbp after enter()):
//   rbp+8   - source byte array address
//   rbp+12  - destination byte array address
//   rbp+16  - K (key) in little endian int array
//   rbp+20  - counter vector byte array address
//   rbp+24  - input length
//   rbp+28  - saved encrypted counter byte array address
//   rbp+32  - address of the 'used' count for the saved encrypted counter
//
// Output:
//   rax     - input length
//
generate_counterMode_AESCrypt_Parallel()2636 address generate_counterMode_AESCrypt_Parallel() {
2637 assert(UseAES, "need AES instructions and misaligned SSE support");
2638 __ align(CodeEntryAlignment);
2639 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2640 address start = __ pc();
2641 const Register from = rsi; // source array address
2642 const Register to = rdx; // destination array address
2643 const Register key = rcx; // key array address
2644 const Register counter = rdi; // counter byte array initialized from initvector array address
2645 // and updated with the incremented counter in the end
2646 const Register len_reg = rbx;
2647 const Register pos = rax;
2648
2649 __ enter(); // required for proper stackwalking of RuntimeStub frame
2650 handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2651
2652 // load registers from incoming parameters
2653 const Address from_param(rbp, 8+0);
2654 const Address to_param (rbp, 8+4);
2655 const Address key_param (rbp, 8+8);
2656 const Address rvec_param (rbp, 8+12);
2657 const Address len_param (rbp, 8+16);
2658 const Address saved_counter_param(rbp, 8 + 20);
2659 const Address used_addr_param(rbp, 8 + 24);
2660
2661 __ movptr(from , from_param);
2662 __ movptr(to , to_param);
2663 __ movptr(len_reg , len_param);
2664
2665 // Use the partially used encrpyted counter from last invocation
2666 Label L_exit_preLoop, L_preLoop_start;
2667
2668 // Use the registers 'counter' and 'key' here in this preloop
2669 // to hold of last 2 params 'used' and 'saved_encCounter_start'
2670 Register used = counter;
2671 Register saved_encCounter_start = key;
2672 Register used_addr = saved_encCounter_start;
2673
2674 __ movptr(used_addr, used_addr_param);
2675 __ movptr(used, Address(used_addr, 0));
2676 __ movptr(saved_encCounter_start, saved_counter_param);
2677
2678 __ BIND(L_preLoop_start);
2679 __ cmpptr(used, 16);
2680 __ jcc(Assembler::aboveEqual, L_exit_preLoop);
2681 __ cmpptr(len_reg, 0);
2682 __ jcc(Assembler::lessEqual, L_exit_preLoop);
2683 __ movb(rax, Address(saved_encCounter_start, used));
2684 __ xorb(rax, Address(from, 0));
2685 __ movb(Address(to, 0), rax);
2686 __ addptr(from, 1);
2687 __ addptr(to, 1);
2688 __ addptr(used, 1);
2689 __ subptr(len_reg, 1);
2690
2691 __ jmp(L_preLoop_start);
2692
2693 __ BIND(L_exit_preLoop);
2694 __ movptr(used_addr, used_addr_param);
2695 __ movptr(used_addr, used_addr_param);
2696 __ movl(Address(used_addr, 0), used);
2697
2698 // load the parameters 'key' and 'counter'
2699 __ movptr(key, key_param);
2700 __ movptr(counter, rvec_param);
2701
2702 // xmm register assignments for the loops below
2703 const XMMRegister xmm_curr_counter = xmm0;
2704 const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded
2705 const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded
2706 const XMMRegister xmm_key = xmm3;
2707 const XMMRegister xmm_result0 = xmm4;
2708 const XMMRegister xmm_result1 = xmm5;
2709 const XMMRegister xmm_result2 = xmm6;
2710 const XMMRegister xmm_result3 = xmm7;
2711 const XMMRegister xmm_from0 = xmm1; //reuse XMM register
2712 const XMMRegister xmm_from1 = xmm2;
2713 const XMMRegister xmm_from2 = xmm3;
2714 const XMMRegister xmm_from3 = xmm4;
2715
2716 //for key_128, key_192, key_256
2717 const int rounds[3] = {10, 12, 14};
2718 Label L_singleBlockLoopTop[3];
2719 Label L_multiBlock_loopTop[3];
2720 Label L_key192_top, L_key256_top;
2721 Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time
2722 Label L_incCounter_single[3]; //for single block, key128, key192, key256
2723 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2724 Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
2725
2726 Label L_exit;
2727 const int PARALLEL_FACTOR = 4; //because of the limited register number
2728
2729 // initialize counter with initial counter
2730 __ movdqu(xmm_curr_counter, Address(counter, 0x00));
2731 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2732 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
2733
2734 // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2735 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2736 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2737 __ cmpl(rax, 52);
2738 __ jcc(Assembler::equal, L_key192_top);
2739 __ cmpl(rax, 60);
2740 __ jcc(Assembler::equal, L_key256_top);
2741
2742 //key128 begins here
2743 __ movptr(pos, NULL_WORD); // init pos before L_multiBlock_loopTop
2744
2745 #define CTR_DoFour(opc, src_reg) \
2746 __ opc(xmm_result0, src_reg); \
2747 __ opc(xmm_result1, src_reg); \
2748 __ opc(xmm_result2, src_reg); \
2749 __ opc(xmm_result3, src_reg);
2750
2751 // k == 0 : generate code for key_128
2752 // k == 1 : generate code for key_192
2753 // k == 2 : generate code for key_256
2754 for (int k = 0; k < 3; ++k) {
2755 //multi blocks starts here
2756 __ align(OptoLoopAlignment);
2757 __ BIND(L_multiBlock_loopTop[k]);
2758 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
2759 __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
2760
2761 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2762 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2763
2764 //load, then increase counters
2765 CTR_DoFour(movdqa, xmm_curr_counter);
2766 __ push(rbx);
2767 inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
2768 inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
2769 inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
2770 inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
2771 __ pop (rbx);
2772
2773 load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
2774
2775 CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
2776 CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key
2777
2778 for (int i = 1; i < rounds[k]; ++i) {
2779 load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2780 CTR_DoFour(aesenc, xmm_key);
2781 }
2782 load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2783 CTR_DoFour(aesenclast, xmm_key);
2784
2785 // get next PARALLEL_FACTOR blocks into xmm_from registers
2786 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2787 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2788 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2789
2790 // PXOR with input text
2791 __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
2792 __ pxor(xmm_result1, xmm_from1);
2793 __ pxor(xmm_result2, xmm_from2);
2794
2795 // store PARALLEL_FACTOR results into the next 64 bytes of output
2796 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2797 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2798 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2799
2800 // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
2801 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2802 __ pxor(xmm_result3, xmm_from3);
2803 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2804
2805 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
2806 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
2807 __ jmp(L_multiBlock_loopTop[k]);
2808
2809 // singleBlock starts here
2810 __ align(OptoLoopAlignment);
2811 __ BIND(L_singleBlockLoopTop[k]);
2812 __ cmpptr(len_reg, 0);
2813 __ jcc(Assembler::equal, L_exit);
2814 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2815 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2816 __ movdqa(xmm_result0, xmm_curr_counter);
2817 load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
2818 __ push(rbx);//rbx is used for increasing counter
2819 inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
2820 __ pop (rbx);
2821 __ pshufb(xmm_result0, xmm_counter_shuf_mask);
2822 __ pxor(xmm_result0, xmm_key);
2823 for (int i = 1; i < rounds[k]; i++) {
2824 load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2825 __ aesenc(xmm_result0, xmm_key);
2826 }
2827 load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2828 __ aesenclast(xmm_result0, xmm_key);
2829 __ cmpptr(len_reg, AESBlockSize);
2830 __ jcc(Assembler::less, L_processTail_insr[k]);
2831 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2832 __ pxor(xmm_result0, xmm_from0);
2833 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2834 __ addptr(pos, AESBlockSize);
2835 __ subptr(len_reg, AESBlockSize);
2836 __ jmp(L_singleBlockLoopTop[k]);
2837
2838 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array
2839 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
2840 __ testptr(len_reg, 8);
2841 __ jcc(Assembler::zero, L_processTail_4_insr[k]);
2842 __ subptr(pos,8);
2843 __ pinsrd(xmm_from0, Address(from, pos), 0);
2844 __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
2845 __ BIND(L_processTail_4_insr[k]);
2846 __ testptr(len_reg, 4);
2847 __ jcc(Assembler::zero, L_processTail_2_insr[k]);
2848 __ subptr(pos,4);
2849 __ pslldq(xmm_from0, 4);
2850 __ pinsrd(xmm_from0, Address(from, pos), 0);
2851 __ BIND(L_processTail_2_insr[k]);
2852 __ testptr(len_reg, 2);
2853 __ jcc(Assembler::zero, L_processTail_1_insr[k]);
2854 __ subptr(pos, 2);
2855 __ pslldq(xmm_from0, 2);
2856 __ pinsrw(xmm_from0, Address(from, pos), 0);
2857 __ BIND(L_processTail_1_insr[k]);
2858 __ testptr(len_reg, 1);
2859 __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
2860 __ subptr(pos, 1);
2861 __ pslldq(xmm_from0, 1);
2862 __ pinsrb(xmm_from0, Address(from, pos), 0);
2863 __ BIND(L_processTail_exit_insr[k]);
2864
2865 __ movptr(saved_encCounter_start, saved_counter_param);
2866 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
2867 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
2868
2869 __ testptr(len_reg, 8);
2870 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
2871 __ pextrd(Address(to, pos), xmm_result0, 0);
2872 __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
2873 __ psrldq(xmm_result0, 8);
2874 __ addptr(pos, 8);
2875 __ BIND(L_processTail_4_extr[k]);
2876 __ testptr(len_reg, 4);
2877 __ jcc(Assembler::zero, L_processTail_2_extr[k]);
2878 __ pextrd(Address(to, pos), xmm_result0, 0);
2879 __ psrldq(xmm_result0, 4);
2880 __ addptr(pos, 4);
2881 __ BIND(L_processTail_2_extr[k]);
2882 __ testptr(len_reg, 2);
2883 __ jcc(Assembler::zero, L_processTail_1_extr[k]);
2884 __ pextrb(Address(to, pos), xmm_result0, 0);
2885 __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
2886 __ psrldq(xmm_result0, 2);
2887 __ addptr(pos, 2);
2888 __ BIND(L_processTail_1_extr[k]);
2889 __ testptr(len_reg, 1);
2890 __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
2891 __ pextrb(Address(to, pos), xmm_result0, 0);
2892
2893 __ BIND(L_processTail_exit_extr[k]);
2894 __ movptr(used_addr, used_addr_param);
2895 __ movl(Address(used_addr, 0), len_reg);
2896 __ jmp(L_exit);
2897 }
2898
2899 __ BIND(L_exit);
2900 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2901 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
2902 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
2903 handleSOERegisters(false /*restoring*/);
2904 __ movptr(rax, len_param); // return length
2905 __ leave(); // required for proper stackwalking of RuntimeStub frame
2906 __ ret(0);
2907
2908 __ BIND (L_key192_top);
2909 __ movptr(pos, NULL_WORD); // init pos before L_multiBlock_loopTop
2910 __ jmp(L_multiBlock_loopTop[1]); //key192
2911
2912 __ BIND (L_key256_top);
2913 __ movptr(pos, NULL_WORD); // init pos before L_multiBlock_loopTop
2914 __ jmp(L_multiBlock_loopTop[2]); //key192
2915
2916 return start;
2917 }
2918
generate_upper_word_mask()2919 address generate_upper_word_mask() {
2920 __ align(64);
2921 StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
2922 address start = __ pc();
2923 __ emit_data(0x00000000, relocInfo::none, 0);
2924 __ emit_data(0x00000000, relocInfo::none, 0);
2925 __ emit_data(0x00000000, relocInfo::none, 0);
2926 __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
2927 return start;
2928 }
2929
generate_shuffle_byte_flip_mask()2930 address generate_shuffle_byte_flip_mask() {
2931 __ align(64);
2932 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
2933 address start = __ pc();
2934 __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2935 __ emit_data(0x08090a0b, relocInfo::none, 0);
2936 __ emit_data(0x04050607, relocInfo::none, 0);
2937 __ emit_data(0x00010203, relocInfo::none, 0);
2938 return start;
2939 }
2940
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
generate_sha1_implCompress(bool multi_block,const char * name)2943 address generate_sha1_implCompress(bool multi_block, const char *name) {
2944 __ align(CodeEntryAlignment);
2945 StubCodeMark mark(this, "StubRoutines", name);
2946 address start = __ pc();
2947
2948 Register buf = rax;
2949 Register state = rdx;
2950 Register ofs = rcx;
2951 Register limit = rdi;
2952
2953 const Address buf_param(rbp, 8 + 0);
2954 const Address state_param(rbp, 8 + 4);
2955 const Address ofs_param(rbp, 8 + 8);
2956 const Address limit_param(rbp, 8 + 12);
2957
2958 const XMMRegister abcd = xmm0;
2959 const XMMRegister e0 = xmm1;
2960 const XMMRegister e1 = xmm2;
2961 const XMMRegister msg0 = xmm3;
2962
2963 const XMMRegister msg1 = xmm4;
2964 const XMMRegister msg2 = xmm5;
2965 const XMMRegister msg3 = xmm6;
2966 const XMMRegister shuf_mask = xmm7;
2967
2968 __ enter();
2969 __ subptr(rsp, 8 * wordSize);
2970 handleSOERegisters(true /*saving*/);
2971
2972 __ movptr(buf, buf_param);
2973 __ movptr(state, state_param);
2974 if (multi_block) {
2975 __ movptr(ofs, ofs_param);
2976 __ movptr(limit, limit_param);
2977 }
2978
2979 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
2980 buf, state, ofs, limit, rsp, multi_block);
2981
2982 handleSOERegisters(false /*restoring*/);
2983 __ addptr(rsp, 8 * wordSize);
2984 __ leave();
2985 __ ret(0);
2986 return start;
2987 }
2988
generate_pshuffle_byte_flip_mask()2989 address generate_pshuffle_byte_flip_mask() {
2990 __ align(64);
2991 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
2992 address start = __ pc();
2993 __ emit_data(0x00010203, relocInfo::none, 0);
2994 __ emit_data(0x04050607, relocInfo::none, 0);
2995 __ emit_data(0x08090a0b, relocInfo::none, 0);
2996 __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
2997 return start;
2998 }
2999
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
generate_sha256_implCompress(bool multi_block,const char * name)3002 address generate_sha256_implCompress(bool multi_block, const char *name) {
3003 __ align(CodeEntryAlignment);
3004 StubCodeMark mark(this, "StubRoutines", name);
3005 address start = __ pc();
3006
3007 Register buf = rbx;
3008 Register state = rsi;
3009 Register ofs = rdx;
3010 Register limit = rcx;
3011
3012 const Address buf_param(rbp, 8 + 0);
3013 const Address state_param(rbp, 8 + 4);
3014 const Address ofs_param(rbp, 8 + 8);
3015 const Address limit_param(rbp, 8 + 12);
3016
3017 const XMMRegister msg = xmm0;
3018 const XMMRegister state0 = xmm1;
3019 const XMMRegister state1 = xmm2;
3020 const XMMRegister msgtmp0 = xmm3;
3021
3022 const XMMRegister msgtmp1 = xmm4;
3023 const XMMRegister msgtmp2 = xmm5;
3024 const XMMRegister msgtmp3 = xmm6;
3025 const XMMRegister msgtmp4 = xmm7;
3026
3027 __ enter();
3028 __ subptr(rsp, 8 * wordSize);
3029 handleSOERegisters(true /*saving*/);
3030 __ movptr(buf, buf_param);
3031 __ movptr(state, state_param);
3032 if (multi_block) {
3033 __ movptr(ofs, ofs_param);
3034 __ movptr(limit, limit_param);
3035 }
3036
3037 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3038 buf, state, ofs, limit, rsp, multi_block);
3039
3040 handleSOERegisters(false);
3041 __ addptr(rsp, 8 * wordSize);
3042 __ leave();
3043 __ ret(0);
3044 return start;
3045 }
3046
3047 // byte swap x86 long
generate_ghash_long_swap_mask()3048 address generate_ghash_long_swap_mask() {
3049 __ align(CodeEntryAlignment);
3050 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3051 address start = __ pc();
3052 __ emit_data(0x0b0a0908, relocInfo::none, 0);
3053 __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
3054 __ emit_data(0x03020100, relocInfo::none, 0);
3055 __ emit_data(0x07060504, relocInfo::none, 0);
3056
3057 return start;
3058 }
3059
3060 // byte swap x86 byte array
generate_ghash_byte_swap_mask()3061 address generate_ghash_byte_swap_mask() {
3062 __ align(CodeEntryAlignment);
3063 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3064 address start = __ pc();
3065 __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3066 __ emit_data(0x08090a0b, relocInfo::none, 0);
3067 __ emit_data(0x04050607, relocInfo::none, 0);
3068 __ emit_data(0x00010203, relocInfo::none, 0);
3069 return start;
3070 }
3071
3072 /* Single and multi-block ghash operations */
generate_ghash_processBlocks()3073 address generate_ghash_processBlocks() {
3074 assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
3075 __ align(CodeEntryAlignment);
3076 Label L_ghash_loop, L_exit;
3077 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3078 address start = __ pc();
3079
3080 const Register state = rdi;
3081 const Register subkeyH = rsi;
3082 const Register data = rdx;
3083 const Register blocks = rcx;
3084
3085 const Address state_param(rbp, 8+0);
3086 const Address subkeyH_param(rbp, 8+4);
3087 const Address data_param(rbp, 8+8);
3088 const Address blocks_param(rbp, 8+12);
3089
3090 const XMMRegister xmm_temp0 = xmm0;
3091 const XMMRegister xmm_temp1 = xmm1;
3092 const XMMRegister xmm_temp2 = xmm2;
3093 const XMMRegister xmm_temp3 = xmm3;
3094 const XMMRegister xmm_temp4 = xmm4;
3095 const XMMRegister xmm_temp5 = xmm5;
3096 const XMMRegister xmm_temp6 = xmm6;
3097 const XMMRegister xmm_temp7 = xmm7;
3098
3099 __ enter();
3100 handleSOERegisters(true); // Save registers
3101
3102 __ movptr(state, state_param);
3103 __ movptr(subkeyH, subkeyH_param);
3104 __ movptr(data, data_param);
3105 __ movptr(blocks, blocks_param);
3106
3107 __ movdqu(xmm_temp0, Address(state, 0));
3108 __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3109
3110 __ movdqu(xmm_temp1, Address(subkeyH, 0));
3111 __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3112
3113 __ BIND(L_ghash_loop);
3114 __ movdqu(xmm_temp2, Address(data, 0));
3115 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3116
3117 __ pxor(xmm_temp0, xmm_temp2);
3118
3119 //
3120 // Multiply with the hash key
3121 //
3122 __ movdqu(xmm_temp3, xmm_temp0);
3123 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
3124 __ movdqu(xmm_temp4, xmm_temp0);
3125 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
3126
3127 __ movdqu(xmm_temp5, xmm_temp0);
3128 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
3129 __ movdqu(xmm_temp6, xmm_temp0);
3130 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
3131
3132 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
3133
3134 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
3135 __ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right
3136 __ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left
3137 __ pxor(xmm_temp3, xmm_temp5);
3138 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
3139 // of the carry-less multiplication of
3140 // xmm0 by xmm1.
3141
3142 // We shift the result of the multiplication by one bit position
3143 // to the left to cope for the fact that the bits are reversed.
3144 __ movdqu(xmm_temp7, xmm_temp3);
3145 __ movdqu(xmm_temp4, xmm_temp6);
3146 __ pslld (xmm_temp3, 1);
3147 __ pslld(xmm_temp6, 1);
3148 __ psrld(xmm_temp7, 31);
3149 __ psrld(xmm_temp4, 31);
3150 __ movdqu(xmm_temp5, xmm_temp7);
3151 __ pslldq(xmm_temp4, 4);
3152 __ pslldq(xmm_temp7, 4);
3153 __ psrldq(xmm_temp5, 12);
3154 __ por(xmm_temp3, xmm_temp7);
3155 __ por(xmm_temp6, xmm_temp4);
3156 __ por(xmm_temp6, xmm_temp5);
3157
3158 //
3159 // First phase of the reduction
3160 //
3161 // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
3162 // independently.
3163 __ movdqu(xmm_temp7, xmm_temp3);
3164 __ movdqu(xmm_temp4, xmm_temp3);
3165 __ movdqu(xmm_temp5, xmm_temp3);
3166 __ pslld(xmm_temp7, 31); // packed right shift shifting << 31
3167 __ pslld(xmm_temp4, 30); // packed right shift shifting << 30
3168 __ pslld(xmm_temp5, 25); // packed right shift shifting << 25
3169 __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
3170 __ pxor(xmm_temp7, xmm_temp5);
3171 __ movdqu(xmm_temp4, xmm_temp7);
3172 __ pslldq(xmm_temp7, 12);
3173 __ psrldq(xmm_temp4, 4);
3174 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
3175
3176 //
3177 // Second phase of the reduction
3178 //
3179 // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
3180 // shift operations.
3181 __ movdqu(xmm_temp2, xmm_temp3);
3182 __ movdqu(xmm_temp7, xmm_temp3);
3183 __ movdqu(xmm_temp5, xmm_temp3);
3184 __ psrld(xmm_temp2, 1); // packed left shifting >> 1
3185 __ psrld(xmm_temp7, 2); // packed left shifting >> 2
3186 __ psrld(xmm_temp5, 7); // packed left shifting >> 7
3187 __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
3188 __ pxor(xmm_temp2, xmm_temp5);
3189 __ pxor(xmm_temp2, xmm_temp4);
3190 __ pxor(xmm_temp3, xmm_temp2);
3191 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
3192
3193 __ decrement(blocks);
3194 __ jcc(Assembler::zero, L_exit);
3195 __ movdqu(xmm_temp0, xmm_temp6);
3196 __ addptr(data, 16);
3197 __ jmp(L_ghash_loop);
3198
3199 __ BIND(L_exit);
3200 // Byte swap 16-byte result
3201 __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3202 __ movdqu(Address(state, 0), xmm_temp6); // store the result
3203
3204 handleSOERegisters(false); // restore registers
3205 __ leave();
3206 __ ret(0);
3207 return start;
3208 }
3209
3210 /**
3211 * Arguments:
3212 *
3213 * Inputs:
3214 * rsp(4) - int crc
3215 * rsp(8) - byte* buf
3216 * rsp(12) - int length
3217 *
 * Output:
3219 * rax - int crc result
3220 */
generate_updateBytesCRC32()3221 address generate_updateBytesCRC32() {
3222 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3223
3224 __ align(CodeEntryAlignment);
3225 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3226
3227 address start = __ pc();
3228
3229 const Register crc = rdx; // crc
3230 const Register buf = rsi; // source java byte array address
3231 const Register len = rcx; // length
3232 const Register table = rdi; // crc_table address (reuse register)
3233 const Register tmp = rbx;
3234 assert_different_registers(crc, buf, len, table, tmp, rax);
3235
3236 BLOCK_COMMENT("Entry:");
3237 __ enter(); // required for proper stackwalking of RuntimeStub frame
3238 __ push(rsi);
3239 __ push(rdi);
3240 __ push(rbx);
3241
3242 Address crc_arg(rbp, 8 + 0);
3243 Address buf_arg(rbp, 8 + 4);
3244 Address len_arg(rbp, 8 + 8);
3245
3246 // Load up:
3247 __ movl(crc, crc_arg);
3248 __ movptr(buf, buf_arg);
3249 __ movl(len, len_arg);
3250
3251 __ kernel_crc32(crc, buf, len, table, tmp);
3252
3253 __ movl(rax, crc);
3254 __ pop(rbx);
3255 __ pop(rdi);
3256 __ pop(rsi);
3257 __ vzeroupper();
3258 __ leave(); // required for proper stackwalking of RuntimeStub frame
3259 __ ret(0);
3260
3261 return start;
3262 }
3263
3264 /**
3265 * Arguments:
3266 *
3267 * Inputs:
3268 * rsp(4) - int crc
3269 * rsp(8) - byte* buf
3270 * rsp(12) - int length
 *   rsp(16)  - table_start - optional (present only when doing a library_call,
 *              not used by x86 algorithm)
3273 *
 * Output:
3275 * rax - int crc result
3276 */
generate_updateBytesCRC32C(bool is_pclmulqdq_supported)3277 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3278 assert(UseCRC32CIntrinsics, "need SSE4_2");
3279 __ align(CodeEntryAlignment);
3280 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3281 address start = __ pc();
3282 const Register crc = rax; // crc
3283 const Register buf = rcx; // source java byte array address
3284 const Register len = rdx; // length
3285 const Register d = rbx;
3286 const Register g = rsi;
3287 const Register h = rdi;
3288 const Register empty = 0; // will never be used, in order not
3289 // to change a signature for crc32c_IPL_Alg2_Alt2
3290 // between 64/32 I'm just keeping it here
3291 assert_different_registers(crc, buf, len, d, g, h);
3292
3293 BLOCK_COMMENT("Entry:");
3294 __ enter(); // required for proper stackwalking of RuntimeStub frame
3295 Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3296 // we need to add additional 4 because __ enter
3297 // have just pushed ebp on a stack
3298 Address buf_arg(rsp, 4 + 4 + 4);
3299 Address len_arg(rsp, 4 + 4 + 8);
3300 // Load up:
3301 __ movl(crc, crc_arg);
3302 __ movl(buf, buf_arg);
3303 __ movl(len, len_arg);
3304 __ push(d);
3305 __ push(g);
3306 __ push(h);
3307 __ crc32c_ipl_alg2_alt2(crc, buf, len,
3308 d, g, h,
3309 empty, empty, empty,
3310 xmm0, xmm1, xmm2,
3311 is_pclmulqdq_supported);
3312 __ pop(h);
3313 __ pop(g);
3314 __ pop(d);
3315 __ vzeroupper();
3316 __ leave(); // required for proper stackwalking of RuntimeStub frame
3317 __ ret(0);
3318
3319 return start;
3320 }
3321
generate_libmExp()3322 address generate_libmExp() {
3323 StubCodeMark mark(this, "StubRoutines", "libmExp");
3324
3325 address start = __ pc();
3326
3327 const XMMRegister x0 = xmm0;
3328 const XMMRegister x1 = xmm1;
3329 const XMMRegister x2 = xmm2;
3330 const XMMRegister x3 = xmm3;
3331
3332 const XMMRegister x4 = xmm4;
3333 const XMMRegister x5 = xmm5;
3334 const XMMRegister x6 = xmm6;
3335 const XMMRegister x7 = xmm7;
3336
3337 const Register tmp = rbx;
3338
3339 BLOCK_COMMENT("Entry:");
3340 __ enter(); // required for proper stackwalking of RuntimeStub frame
3341 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3342 __ leave(); // required for proper stackwalking of RuntimeStub frame
3343 __ ret(0);
3344
3345 return start;
3346
3347 }
3348
generate_libmLog()3349 address generate_libmLog() {
3350 StubCodeMark mark(this, "StubRoutines", "libmLog");
3351
3352 address start = __ pc();
3353
3354 const XMMRegister x0 = xmm0;
3355 const XMMRegister x1 = xmm1;
3356 const XMMRegister x2 = xmm2;
3357 const XMMRegister x3 = xmm3;
3358
3359 const XMMRegister x4 = xmm4;
3360 const XMMRegister x5 = xmm5;
3361 const XMMRegister x6 = xmm6;
3362 const XMMRegister x7 = xmm7;
3363
3364 const Register tmp = rbx;
3365
3366 BLOCK_COMMENT("Entry:");
3367 __ enter(); // required for proper stackwalking of RuntimeStub frame
3368 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3369 __ leave(); // required for proper stackwalking of RuntimeStub frame
3370 __ ret(0);
3371
3372 return start;
3373
3374 }
3375
generate_libmLog10()3376 address generate_libmLog10() {
3377 StubCodeMark mark(this, "StubRoutines", "libmLog10");
3378
3379 address start = __ pc();
3380
3381 const XMMRegister x0 = xmm0;
3382 const XMMRegister x1 = xmm1;
3383 const XMMRegister x2 = xmm2;
3384 const XMMRegister x3 = xmm3;
3385
3386 const XMMRegister x4 = xmm4;
3387 const XMMRegister x5 = xmm5;
3388 const XMMRegister x6 = xmm6;
3389 const XMMRegister x7 = xmm7;
3390
3391 const Register tmp = rbx;
3392
3393 BLOCK_COMMENT("Entry:");
3394 __ enter(); // required for proper stackwalking of RuntimeStub frame
3395 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3396 __ leave(); // required for proper stackwalking of RuntimeStub frame
3397 __ ret(0);
3398
3399 return start;
3400
3401 }
3402
generate_libmPow()3403 address generate_libmPow() {
3404 StubCodeMark mark(this, "StubRoutines", "libmPow");
3405
3406 address start = __ pc();
3407
3408 const XMMRegister x0 = xmm0;
3409 const XMMRegister x1 = xmm1;
3410 const XMMRegister x2 = xmm2;
3411 const XMMRegister x3 = xmm3;
3412
3413 const XMMRegister x4 = xmm4;
3414 const XMMRegister x5 = xmm5;
3415 const XMMRegister x6 = xmm6;
3416 const XMMRegister x7 = xmm7;
3417
3418 const Register tmp = rbx;
3419
3420 BLOCK_COMMENT("Entry:");
3421 __ enter(); // required for proper stackwalking of RuntimeStub frame
3422 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3423 __ leave(); // required for proper stackwalking of RuntimeStub frame
3424 __ ret(0);
3425
3426 return start;
3427
3428 }
3429
generate_libm_reduce_pi04l()3430 address generate_libm_reduce_pi04l() {
3431 StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");
3432
3433 address start = __ pc();
3434
3435 BLOCK_COMMENT("Entry:");
3436 __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3437
3438 return start;
3439
3440 }
3441
generate_libm_sin_cos_huge()3442 address generate_libm_sin_cos_huge() {
3443 StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");
3444
3445 address start = __ pc();
3446
3447 const XMMRegister x0 = xmm0;
3448 const XMMRegister x1 = xmm1;
3449
3450 BLOCK_COMMENT("Entry:");
3451 __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3452
3453 return start;
3454
3455 }
3456
generate_libmSin()3457 address generate_libmSin() {
3458 StubCodeMark mark(this, "StubRoutines", "libmSin");
3459
3460 address start = __ pc();
3461
3462 const XMMRegister x0 = xmm0;
3463 const XMMRegister x1 = xmm1;
3464 const XMMRegister x2 = xmm2;
3465 const XMMRegister x3 = xmm3;
3466
3467 const XMMRegister x4 = xmm4;
3468 const XMMRegister x5 = xmm5;
3469 const XMMRegister x6 = xmm6;
3470 const XMMRegister x7 = xmm7;
3471
3472 BLOCK_COMMENT("Entry:");
3473 __ enter(); // required for proper stackwalking of RuntimeStub frame
3474 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
3475 __ leave(); // required for proper stackwalking of RuntimeStub frame
3476 __ ret(0);
3477
3478 return start;
3479
3480 }
3481
generate_libmCos()3482 address generate_libmCos() {
3483 StubCodeMark mark(this, "StubRoutines", "libmCos");
3484
3485 address start = __ pc();
3486
3487 const XMMRegister x0 = xmm0;
3488 const XMMRegister x1 = xmm1;
3489 const XMMRegister x2 = xmm2;
3490 const XMMRegister x3 = xmm3;
3491
3492 const XMMRegister x4 = xmm4;
3493 const XMMRegister x5 = xmm5;
3494 const XMMRegister x6 = xmm6;
3495 const XMMRegister x7 = xmm7;
3496
3497 const Register tmp = rbx;
3498
3499 BLOCK_COMMENT("Entry:");
3500 __ enter(); // required for proper stackwalking of RuntimeStub frame
3501 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3502 __ leave(); // required for proper stackwalking of RuntimeStub frame
3503 __ ret(0);
3504
3505 return start;
3506
3507 }
3508
generate_libm_tan_cot_huge()3509 address generate_libm_tan_cot_huge() {
3510 StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");
3511
3512 address start = __ pc();
3513
3514 const XMMRegister x0 = xmm0;
3515 const XMMRegister x1 = xmm1;
3516
3517 BLOCK_COMMENT("Entry:");
3518 __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3519
3520 return start;
3521
3522 }
3523
generate_libmTan()3524 address generate_libmTan() {
3525 StubCodeMark mark(this, "StubRoutines", "libmTan");
3526
3527 address start = __ pc();
3528
3529 const XMMRegister x0 = xmm0;
3530 const XMMRegister x1 = xmm1;
3531 const XMMRegister x2 = xmm2;
3532 const XMMRegister x3 = xmm3;
3533
3534 const XMMRegister x4 = xmm4;
3535 const XMMRegister x5 = xmm5;
3536 const XMMRegister x6 = xmm6;
3537 const XMMRegister x7 = xmm7;
3538
3539 const Register tmp = rbx;
3540
3541 BLOCK_COMMENT("Entry:");
3542 __ enter(); // required for proper stackwalking of RuntimeStub frame
3543 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3544 __ leave(); // required for proper stackwalking of RuntimeStub frame
3545 __ ret(0);
3546
3547 return start;
3548
3549 }
3550
3551 // Safefetch stubs.
generate_safefetch(const char * name,int size,address * entry,address * fault_pc,address * continuation_pc)3552 void generate_safefetch(const char* name, int size, address* entry,
3553 address* fault_pc, address* continuation_pc) {
3554 // safefetch signatures:
3555 // int SafeFetch32(int* adr, int errValue);
3556 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3557
3558 StubCodeMark mark(this, "StubRoutines", name);
3559
3560 // Entry point, pc or function descriptor.
3561 *entry = __ pc();
3562
3563 __ movl(rax, Address(rsp, 0x8));
3564 __ movl(rcx, Address(rsp, 0x4));
3565 // Load *adr into eax, may fault.
3566 *fault_pc = __ pc();
3567 switch (size) {
3568 case 4:
3569 // int32_t
3570 __ movl(rax, Address(rcx, 0));
3571 break;
3572 case 8:
3573 // int64_t
3574 Unimplemented();
3575 break;
3576 default:
3577 ShouldNotReachHere();
3578 }
3579
3580 // Return errValue or *adr.
3581 *continuation_pc = __ pc();
3582 __ ret(0);
3583 }
3584
// Generates the "nmethod_entry_barrier" stub and returns its start address.
//
// The stub pushes a cookie word, builds a frame, saves the general registers
// and xmm0/xmm1, and calls BarrierSetNMethod::nmethod_stub_entry_barrier with
// the address of the slot expected to hold the return address. Afterwards it
// either returns normally, or - when the call returned 1 - reloads rsp from
// the cookie slot and jumps through the word stored just below the new rsp.
address generate_method_entry_barrier() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

  Label deoptimize_label;

  address start = __ pc();

  __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing

  BLOCK_COMMENT("Entry:");
  __ enter();  // save rbp

  // save rbx, because we want to use that value.
  // We could do without it but then we depend on the number of slots used by pusha
  __ push(rbx);

  // rbx := address of the word three slots above the current rsp; it is passed
  // to the barrier below as its only argument.
  __ lea(rbx, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for rbx - this should be the return address

  // Save all general purpose registers around the runtime call.
  __ pusha();

  // xmm0 and xmm1 may be used for passing float/double arguments
  // Reserve two slots of wordSize * 4 bytes each and spill both registers.
  const int xmm_size = wordSize * 4;
  const int xmm_spill_size = xmm_size * 2;
  __ subptr(rsp, xmm_spill_size);
  __ movdqu(Address(rsp, xmm_size * 1), xmm1);
  __ movdqu(Address(rsp, xmm_size * 0), xmm0);

  // Leaf call into the barrier; its int result is tested in rax below.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), rbx);

  // Reload xmm0/xmm1 and release the spill area.
  __ movdqu(xmm0, Address(rsp, xmm_size * 0));
  __ movdqu(xmm1, Address(rsp, xmm_size * 1));
  __ addptr(rsp, xmm_spill_size);

  __ cmpl(rax, 1); // 1 means deoptimize
  __ jcc(Assembler::equal, deoptimize_label);

  // Normal path: undo the saves in reverse order, drop the cookie and return.
  __ popa();
  __ pop(rbx);

  __ leave();

  __ addptr(rsp, 1 * wordSize); // cookie
  __ ret(0);

  __ BIND(deoptimize_label);

  // Deoptimize path: restore the registers and frame; rsp then points at the
  // cookie slot.
  __ popa();
  __ pop(rbx);

  __ leave();

  // this can be taken out, but is good for verification purposes. getting a SIGSEGV
  // here while still having a correct stack is valuable
  __ testptr(rsp, Address(rsp, 0));

  // Switch to the stack pointer stored in the cookie slot, then jump through
  // the word located one slot below that new rsp.
  __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
  __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point

  return start;
}
3646
3647 public:
3648 // Information about frame layout at time of blocking runtime call.
3649 // Note that we only have to preserve callee-saved registers since
3650 // the compilers are responsible for supplying a continuation point
3651 // if they expect all registers to be preserved.
enum layout {
  // Slot indices in words; generate_throw_exception addresses the first three
  // as Address(rsp, <slot> * wordSize).
  thread_off,                       // last_java_sp
                                    // (slot where generate_throw_exception stores the JavaThread register)
  arg1_off,                         // slot for the optional first register argument
  arg2_off,                         // slot for the optional second register argument
  rbp_off,                          // callee saved register
  ret_pc,                           // presumably the return pc slot; generate_throw_exception treats
                                    // pc and rbp as already pushed - TODO confirm
  framesize                         // total number of slots; used for the prolog adjustment and
                                    // the OopMap / RuntimeStub frame size
};
3660
3661 private:
3662
3663 #undef __
3664 #define __ masm->
3665
3666 //------------------------------------------------------------------------------------------------------------------------
3667 // Continuation point for throwing of implicit exceptions that are not handled in
3668 // the current activation. Fabricates an exception oop and initiates normal
3669 // exception dispatching in this frame.
3670 //
3671 // Previously the compiler (c2) allowed for callee save registers on Java calls.
3672 // This is no longer true after adapter frames were removed but could possibly
3673 // be brought back in the future if the interpreter code was reworked and it
3674 // was deemed worthwhile. The comment below was left to describe what must
3675 // happen here if callee saves were resurrected. As it stands now this stub
  // could actually be a vanilla BufferBlob and have no oopMap at all.
3677 // Since it doesn't make much difference we've chosen to leave it the
3678 // way it was in the callee save days and keep the comment.
3679
3680 // If we need to preserve callee-saved values we need a callee-saved oop map and
3681 // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3682 // If the compiler needs all registers to be preserved between the fault
3683 // point and the exception handler then it must assume responsibility for that in
3684 // AbstractCompiler::continuation_for_implicit_null_exception or
3685 // continuation_for_implicit_division_by_zero_exception. All other implicit
3686 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3687 // either at call sites or otherwise assume that stack unwinding will be initiated,
3688 // so caller saved registers were assumed volatile in the compiler.
generate_throw_exception(const char * name,address runtime_entry,Register arg1=noreg,Register arg2=noreg)3689 address generate_throw_exception(const char* name, address runtime_entry,
3690 Register arg1 = noreg, Register arg2 = noreg) {
3691
3692 int insts_size = 256;
3693 int locs_size = 32;
3694
3695 CodeBuffer code(name, insts_size, locs_size);
3696 OopMapSet* oop_maps = new OopMapSet();
3697 MacroAssembler* masm = new MacroAssembler(&code);
3698
3699 address start = __ pc();
3700
3701 // This is an inlined and slightly modified version of call_VM
3702 // which has the ability to fetch the return PC out of
3703 // thread-local storage and also sets up last_Java_sp slightly
3704 // differently than the real call_VM
3705 Register java_thread = rbx;
3706 __ get_thread(java_thread);
3707
3708 __ enter(); // required for proper stackwalking of RuntimeStub frame
3709
3710 // pc and rbp, already pushed
3711 __ subptr(rsp, (framesize-2) * wordSize); // prolog
3712
3713 // Frame is now completed as far as size and linkage.
3714
3715 int frame_complete = __ pc() - start;
3716
3717 // push java thread (becomes first argument of C function)
3718 __ movptr(Address(rsp, thread_off * wordSize), java_thread);
3719 if (arg1 != noreg) {
3720 __ movptr(Address(rsp, arg1_off * wordSize), arg1);
3721 }
3722 if (arg2 != noreg) {
3723 assert(arg1 != noreg, "missing reg arg");
3724 __ movptr(Address(rsp, arg2_off * wordSize), arg2);
3725 }
3726
3727 // Set up last_Java_sp and last_Java_fp
3728 __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
3729
3730 // Call runtime
3731 BLOCK_COMMENT("call runtime_entry");
3732 __ call(RuntimeAddress(runtime_entry));
3733 // Generate oop map
3734 OopMap* map = new OopMap(framesize, 0);
3735 oop_maps->add_gc_map(__ pc() - start, map);
3736
3737 // restore the thread (cannot use the pushed argument since arguments
3738 // may be overwritten by C code generated by an optimizing compiler);
3739 // however can use the register value directly if it is callee saved.
3740 __ get_thread(java_thread);
3741
3742 __ reset_last_Java_frame(java_thread, true);
3743
3744 __ leave(); // required for proper stackwalking of RuntimeStub frame
3745
3746 // check for pending exceptions
3747 #ifdef ASSERT
3748 Label L;
3749 __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3750 __ jcc(Assembler::notEqual, L);
3751 __ should_not_reach_here();
3752 __ bind(L);
3753 #endif /* ASSERT */
3754 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3755
3756
3757 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
3758 return stub->entry_point();
3759 }
3760
3761
create_control_words()3762 void create_control_words() {
3763 // Round to nearest, 53-bit mode, exceptions masked
3764 StubRoutines::_fpu_cntrl_wrd_std = 0x027F;
3765 // Round to zero, 53-bit mode, exception mased
3766 StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
3767 // Round to nearest, 24-bit mode, exceptions masked
3768 StubRoutines::_fpu_cntrl_wrd_24 = 0x007F;
3769 // Round to nearest, 64-bit mode, exceptions masked
3770 StubRoutines::_mxcsr_std = 0x1F80;
3771 // Note: the following two constants are 80-bit values
3772 // layout is critical for correct loading by FPU.
3773 // Bias for strict fp multiply/divide
3774 StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
3775 StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
3776 StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
3777 // Un-Bias for strict fp multiply/divide
3778 StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
3779 StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
3780 StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
3781 }
3782
3783 //---------------------------------------------------------------------------
3784 // Initialization
3785
generate_initial()3786 void generate_initial() {
3787 // Generates all stubs and initializes the entry points
3788
3789 //------------------------------------------------------------------------------------------------------------------------
3790 // entry points that exist in all platforms
3791 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3792 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3793 StubRoutines::_forward_exception_entry = generate_forward_exception();
3794
3795 StubRoutines::_call_stub_entry =
3796 generate_call_stub(StubRoutines::_call_stub_return_address);
3797 // is referenced by megamorphic call
3798 StubRoutines::_catch_exception_entry = generate_catch_exception();
3799
3800 // These are currently used by Solaris/Intel
3801 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
3802
3803 // platform dependent
3804 create_control_words();
3805
3806 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
3807 StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
3808 StubRoutines::_d2i_wrapper = generate_d2i_wrapper(T_INT,
3809 CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
3810 StubRoutines::_d2l_wrapper = generate_d2i_wrapper(T_LONG,
3811 CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
3812
3813 // Build this early so it's available for the interpreter
3814 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception",
3815 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
3816 StubRoutines::_throw_delayed_StackOverflowError_entry = generate_throw_exception("delayed StackOverflowError throw_exception",
3817 CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
3818
3819 if (UseCRC32Intrinsics) {
3820 // set table address before stub generation which use it
3821 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
3822 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
3823 }
3824
3825 if (UseCRC32CIntrinsics) {
3826 bool supports_clmul = VM_Version::supports_clmul();
3827 StubRoutines::x86::generate_CRC32C_table(supports_clmul);
3828 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
3829 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
3830 }
3831 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
3832 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3833 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3834 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3835 StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
3836 StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
3837 StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
3838 StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
3839 StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
3840 }
3841 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
3842 StubRoutines::_dexp = generate_libmExp();
3843 }
3844 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
3845 StubRoutines::_dlog = generate_libmLog();
3846 }
3847 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
3848 StubRoutines::_dlog10 = generate_libmLog10();
3849 }
3850 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
3851 StubRoutines::_dpow = generate_libmPow();
3852 }
3853 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3854 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
3855 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3856 StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
3857 }
3858 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
3859 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3860 StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
3861 }
3862 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
3863 StubRoutines::_dsin = generate_libmSin();
3864 }
3865 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
3866 StubRoutines::_dcos = generate_libmCos();
3867 }
3868 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
3869 StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
3870 StubRoutines::_dtan = generate_libmTan();
3871 }
3872 }
3873
3874 // Safefetch stubs.
3875 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
3876 &StubRoutines::_safefetch32_fault_pc,
3877 &StubRoutines::_safefetch32_continuation_pc);
3878 StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry;
3879 StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc;
3880 StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
3881 }
3882
generate_all()3883 void generate_all() {
3884 // Generates all stubs and initializes the entry points
3885
3886 // These entry points require SharedInfo::stack0 to be set up in non-core builds
3887 // and need to be relocatable, so they each fabricate a RuntimeStub internally.
3888 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
3889 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
3890 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
3891
3892 //------------------------------------------------------------------------------------------------------------------------
3893 // entry points that are platform specific
3894
3895 StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF);
3896 StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000);
3897 StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
3898 StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
3899 StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
3900 StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
3901 StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
3902
3903 // support for verify_oop (must happen after universe_init)
3904 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
3905
3906 // arraycopy stubs used by compilers
3907 generate_arraycopy_stubs();
3908
3909 // don't bother generating these AES intrinsic stubs unless global flag is set
3910 if (UseAESIntrinsics) {
3911 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others
3912
3913 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3914 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3915 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3916 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
3917 }
3918
3919 if (UseAESCTRIntrinsics) {
3920 StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
3921 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
3922 }
3923
3924 if (UseSHA1Intrinsics) {
3925 StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
3926 StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
3927 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
3928 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
3929 }
3930 if (UseSHA256Intrinsics) {
3931 StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
3932 StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
3933 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
3934 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
3935 }
3936
3937 // Generate GHASH intrinsics code
3938 if (UseGHASHIntrinsics) {
3939 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
3940 StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
3941 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3942 }
3943
3944 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3945 if (bs_nm != NULL) {
3946 StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
3947 }
3948 }
3949
3950
3951 public:
// Constructing a StubGenerator emits stubs into 'code': generate_initial()
// when 'all' is false, generate_all() when 'all' is true.
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  if (!all) {
    generate_initial();
    return;
  }
  generate_all();
}
3959 }; // end class declaration
3960
3961 #define UCM_TABLE_MAX_ENTRIES 8
StubGenerator_generate(CodeBuffer * code,bool all)3962 void StubGenerator_generate(CodeBuffer* code, bool all) {
3963 if (UnsafeCopyMemory::_table == NULL) {
3964 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3965 }
3966 StubGenerator g(code, all);
3967 }
3968