1 /*
2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "interpreter/interpreter.hpp"
29 #include "nativeInst_ppc.hpp"
30 #include "oops/instanceOop.hpp"
31 #include "oops/method.hpp"
32 #include "oops/objArrayKlass.hpp"
33 #include "oops/oop.inline.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/frame.inline.hpp"
36 #include "runtime/handles.inline.hpp"
37 #include "runtime/sharedRuntime.hpp"
38 #include "runtime/stubCodeGenerator.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "utilities/top.hpp"
41 #include "runtime/thread.inline.hpp"
42
43 #define __ _masm->
44
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) // nothing
47 #else
48 #define BLOCK_COMMENT(str) __ block_comment(str)
49 #endif
50
51 class StubGenerator: public StubCodeGenerator {
52 private:
53
54 // Call stubs are used to call Java from C
55 //
56 // Arguments:
57 //
58 // R3 - call wrapper address : address
59 // R4 - result : intptr_t*
60 // R5 - result type : BasicType
61 // R6 - method : Method
62 // R7 - frame mgr entry point : address
63 // R8 - parameter block : intptr_t*
64 // R9 - parameter count in words : int
65 // R10 - thread : Thread*
66 //
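// For orientation only: on the VM side the stub is reached through the CallStub
// function pointer kept in StubRoutines. The sketch below is an assumption about
// how the eight arguments above line up with that call; JavaCalls::call_helper
// in javaCalls.cpp is the authoritative caller.
//
//   // illustrative sketch, not generated code:
//   StubRoutines::call_stub()(
//       (address)&link,              // R3:  call wrapper address
//       result_val_address,          // R4:  where the result is stored
//       result_type,                 // R5:  BasicType of the result
//       method(),                    // R6:  Method* to invoke
//       entry_point,                 // R7:  frame manager / native entry
//       args->parameters(),          // R8:  parameter block
//       args->size_of_parameters(),  // R9:  parameter count in words
//       thread);                     // R10: current thread
//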
67 address generate_call_stub(address& return_address) {
68 // Setup a new c frame, copy java arguments, call frame manager or
69 // native_entry, and process result.
70
71 StubCodeMark mark(this, "StubRoutines", "call_stub");
72
73 address start = __ function_entry();
74
75 // some sanity checks
76 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned");
77 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned");
78 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned");
79 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
80 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
81
82 Register r_arg_call_wrapper_addr = R3;
83 Register r_arg_result_addr = R4;
84 Register r_arg_result_type = R5;
85 Register r_arg_method = R6;
86 Register r_arg_entry = R7;
87 Register r_arg_thread = R10;
88
89 Register r_temp = R24;
90 Register r_top_of_arguments_addr = R25;
91 Register r_entryframe_fp = R26;
92
93 {
94 // Stack on entry to call_stub:
95 //
96 // F1 [C_FRAME]
97 // ...
98
99 Register r_arg_argument_addr = R8;
100 Register r_arg_argument_count = R9;
101 Register r_frame_alignment_in_bytes = R27;
102 Register r_argument_addr = R28;
103 Register r_argumentcopy_addr = R29;
104 Register r_argument_size_in_bytes = R30;
105 Register r_frame_size = R23;
106
107 Label arguments_copied;
108
109 // Save LR/CR to caller's C_FRAME.
110 __ save_LR_CR(R0);
111
112 // Zero extend arg_argument_count.
113 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
114
115 // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
116 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
117
118 // Keep copy of our frame pointer (caller's SP).
119 __ mr(r_entryframe_fp, R1_SP);
120
121 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
122 // Push ENTRY_FRAME including arguments:
123 //
124 // F0 [TOP_IJAVA_FRAME_ABI]
125 // alignment (optional)
126 // [outgoing Java arguments]
127 // [ENTRY_FRAME_LOCALS]
128 // F1 [C_FRAME]
129 // ...
130
131 // calculate frame size
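//
// Worked example (illustrative; the real constants come from frame_ppc.hpp):
// with 3 argument words and logStackElementSize == 3 the argument block is
// 3 * 8 = 24 bytes, one 8-byte alignment slot is added because the count is
// odd, and top_ijava_frame_abi_size plus entry_frame_locals_size complete the
// frame; assuming both ABI sizes are multiples of 16 (cf. the asserts above),
// the resulting frame size stays 16-byte aligned.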
132
133 // unaligned size of arguments
134 __ sldi(r_argument_size_in_bytes,
135 r_arg_argument_count, Interpreter::logStackElementSize);
136 // arguments alignment (max 1 slot)
137 // FIXME: use round_to() here
138 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
139 __ sldi(r_frame_alignment_in_bytes,
140 r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
141
142 // size = unaligned size of arguments + top abi's size
143 __ addi(r_frame_size, r_argument_size_in_bytes,
144 frame::top_ijava_frame_abi_size);
145 // size += arguments alignment
146 __ add(r_frame_size,
147 r_frame_size, r_frame_alignment_in_bytes);
148 // size += size of call_stub locals
149 __ addi(r_frame_size,
150 r_frame_size, frame::entry_frame_locals_size);
151
152 // push ENTRY_FRAME
153 __ push_frame(r_frame_size, r_temp);
154
155 // initialize call_stub locals (step 1)
156 __ std(r_arg_call_wrapper_addr,
157 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
158 __ std(r_arg_result_addr,
159 _entry_frame_locals_neg(result_address), r_entryframe_fp);
160 __ std(r_arg_result_type,
161 _entry_frame_locals_neg(result_type), r_entryframe_fp);
162 // we will save arguments_tos_address later
163
164
165 BLOCK_COMMENT("Copy Java arguments");
166 // copy Java arguments
167
168 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
169 // FIXME: why not simply use SP+frame::top_ijava_frame_size?
170 __ addi(r_top_of_arguments_addr,
171 R1_SP, frame::top_ijava_frame_abi_size);
172 __ add(r_top_of_arguments_addr,
173 r_top_of_arguments_addr, r_frame_alignment_in_bytes);
174
175 // any arguments to copy?
176 __ cmpdi(CCR0, r_arg_argument_count, 0);
177 __ beq(CCR0, arguments_copied);
178
179 // prepare loop and copy arguments in reverse order
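// Roughly equivalent C++ (illustrative sketch only; the generated code walks
// the source backwards and the destination forwards):
//
//   intptr_t*       dst = (intptr_t*) top_of_arguments_addr;
//   const intptr_t* src = (const intptr_t*) arg_argument_addr;
//   for (int i = arg_argument_count - 1; i >= 0; i--) {
//     *dst++ = src[i];
//   }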
180 {
181 // init CTR with arg_argument_count
182 __ mtctr(r_arg_argument_count);
183
184 // let r_argumentcopy_addr point to last outgoing Java arguments
185 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
186
187 // let r_argument_addr point to last incoming java argument
188 __ add(r_argument_addr,
189 r_arg_argument_addr, r_argument_size_in_bytes);
190 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
191
192 // now loop while CTR > 0 and copy arguments
193 {
194 Label next_argument;
195 __ bind(next_argument);
196
197 __ ld(r_temp, 0, r_argument_addr);
198 // argument_addr--;
199 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
200 __ std(r_temp, 0, r_argumentcopy_addr);
201 // argumentcopy_addr++;
202 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
203
204 __ bdnz(next_argument);
205 }
206 }
207
208 // Arguments copied, continue.
209 __ bind(arguments_copied);
210 }
211
212 {
213 BLOCK_COMMENT("Call frame manager or native entry.");
214 // Call frame manager or native entry.
215 Register r_new_arg_entry = R14;
216 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
217 r_arg_method, r_arg_thread);
218
219 __ mr(r_new_arg_entry, r_arg_entry);
220
221 // Register state on entry to frame manager / native entry:
222 //
223 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
224 // R19_method - Method
225 // R16_thread - JavaThread*
226
227 // Tos must point to last argument - element_size.
228 #ifdef CC_INTERP
229 const Register tos = R17_tos;
230 #else
231 const Register tos = R15_esp;
232 #endif
233 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
234
235 // initialize call_stub locals (step 2)
236 // now save tos as arguments_tos_address
237 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
238
239 // load argument registers for call
240 __ mr(R19_method, r_arg_method);
241 __ mr(R16_thread, r_arg_thread);
242 assert(tos != r_arg_method, "trashed r_arg_method");
243 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
244
245 // Set R15_prev_state to 0 for simplifying checks in callee.
246 #ifdef CC_INTERP
247 __ li(R15_prev_state, 0);
248 #else
249 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
250 #endif
251 // Stack on entry to frame manager / native entry:
252 //
253 // F0 [TOP_IJAVA_FRAME_ABI]
254 // alignment (optional)
255 // [outgoing Java arguments]
256 // [ENTRY_FRAME_LOCALS]
257 // F1 [C_FRAME]
258 // ...
259 //
260
261 // global toc register
262 __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
263
264 // Load narrow oop base.
265 __ reinit_heapbase(R30, R11_scratch1);
266
267 // Remember the senderSP so the interpreter can pop c2i arguments off the stack
268 // when called via a c2i.
269
270 // Pass initial_caller_sp to framemanager.
271 __ mr(R21_tmp1, R1_SP);
272
273 // Do a light-weight C-call here, r_new_arg_entry holds the address
274 // of the interpreter entry point (frame manager or native entry)
275 // and save runtime-value of LR in return_address.
276 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
277 "trashed r_new_arg_entry");
278 return_address = __ call_stub(r_new_arg_entry);
279 }
280
281 {
282 BLOCK_COMMENT("Returned from frame manager or native entry.");
283 // Returned from frame manager or native entry.
284 // Now pop frame, process result, and return to caller.
285
286 // Stack on exit from frame manager / native entry:
287 //
288 // F0 [ABI]
289 // ...
290 // [ENTRY_FRAME_LOCALS]
291 // F1 [C_FRAME]
292 // ...
293 //
294 // Just pop the topmost frame ...
295 //
296
297 Label ret_is_object;
298 Label ret_is_long;
299 Label ret_is_float;
300 Label ret_is_double;
301
302 Register r_entryframe_fp = R30;
303 Register r_lr = R7_ARG5;
304 Register r_cr = R8_ARG6;
305
306 // Reload some volatile registers which we've spilled before the call
307 // to frame manager / native entry.
308 // Access all locals via frame pointer, because we know nothing about
309 // the topmost frame's size.
310 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
311 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
312 __ ld(r_arg_result_addr,
313 _entry_frame_locals_neg(result_address), r_entryframe_fp);
314 __ ld(r_arg_result_type,
315 _entry_frame_locals_neg(result_type), r_entryframe_fp);
316 __ ld(r_cr, _abi(cr), r_entryframe_fp);
317 __ ld(r_lr, _abi(lr), r_entryframe_fp);
318
319 // pop frame and restore non-volatiles, LR and CR
320 __ mr(R1_SP, r_entryframe_fp);
321 __ mtcr(r_cr);
322 __ mtlr(r_lr);
323
324 // Store result depending on type. Everything that is not
325 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
326 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
327 __ cmpwi(CCR1, r_arg_result_type, T_LONG);
328 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
329 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
330
331 // restore non-volatile registers
332 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
333
334
335 // Stack on exit from call_stub:
336 //
337 // 0 [C_FRAME]
338 // ...
339 //
340 // no call_stub frames left.
341
342 // All non-volatiles have been restored at this point!!
343 assert(R3_RET == R3, "R3_RET should be R3");
344
345 __ beq(CCR0, ret_is_object);
346 __ beq(CCR1, ret_is_long);
347 __ beq(CCR5, ret_is_float);
348 __ beq(CCR6, ret_is_double);
349
350 // default:
351 __ stw(R3_RET, 0, r_arg_result_addr);
352 __ blr(); // return to caller
353
354 // case T_OBJECT:
355 __ bind(ret_is_object);
356 __ std(R3_RET, 0, r_arg_result_addr);
357 __ blr(); // return to caller
358
359 // case T_LONG:
360 __ bind(ret_is_long);
361 __ std(R3_RET, 0, r_arg_result_addr);
362 __ blr(); // return to caller
363
364 // case T_FLOAT:
365 __ bind(ret_is_float);
366 __ stfs(F1_RET, 0, r_arg_result_addr);
367 __ blr(); // return to caller
368
369 // case T_DOUBLE:
370 __ bind(ret_is_double);
371 __ stfd(F1_RET, 0, r_arg_result_addr);
372 __ blr(); // return to caller
373 }
374
375 return start;
376 }
377
378 // Return point for a Java call if there's an exception thrown in
379 // Java code. The exception is caught and transformed into a
380 // pending exception stored in JavaThread that can be tested from
381 // within the VM.
382 //
383 address generate_catch_exception() {
384 StubCodeMark mark(this, "StubRoutines", "catch_exception");
385
386 address start = __ pc();
387
388 // Registers alive
389 //
390 // R16_thread
391 // R3_ARG1 - address of pending exception
392 // R4_ARG2 - return address in call stub
393
394 const Register exception_file = R21_tmp1;
395 const Register exception_line = R22_tmp2;
396
397 __ load_const(exception_file, (void*)__FILE__);
398 __ load_const(exception_line, (void*)__LINE__);
399
400 __ std(R3_ARG1, thread_(pending_exception));
401 // store into `char *'
402 __ std(exception_file, thread_(exception_file));
403 // store into `int'
404 __ stw(exception_line, thread_(exception_line));
405
406 // complete return to VM
407 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
408
409 __ mtlr(R4_ARG2);
410 // continue in call stub
411 __ blr();
412
413 return start;
414 }
415
416 // Continuation point for runtime calls returning with a pending
417 // exception. The pending exception check happened in the runtime
418 // or native call stub. The pending exception in Thread is
419 // converted into a Java-level exception.
420 //
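// In pseudocode (sketch; names as used in the code below):
//
//   address handler = SharedRuntime::exception_handler_for_return_address(thread, return_pc);
//   R3_ARG1 = thread->pending_exception();   // exception oop
//   R4_ARG2 = return_pc;                     // exception pc (caller's return address)
//   clear Thread::_pending_exception;        // store NULL
//   jump to handler;
//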
421 address generate_forward_exception() {
422 StubCodeMark mark(this, "StubRoutines", "forward_exception");
423 address start = __ pc();
424
425 #if !defined(PRODUCT)
426 if (VerifyOops) {
427 // Get pending exception oop.
428 __ ld(R3_ARG1,
429 in_bytes(Thread::pending_exception_offset()),
430 R16_thread);
431 // Make sure that this code is only executed if there is a pending exception.
432 {
433 Label L;
434 __ cmpdi(CCR0, R3_ARG1, 0);
435 __ bne(CCR0, L);
436 __ stop("StubRoutines::forward exception: no pending exception (1)");
437 __ bind(L);
438 }
439 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
440 }
441 #endif
442
443 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
444 __ save_LR_CR(R4_ARG2);
445 __ push_frame_reg_args(0, R0);
446 // Find exception handler.
447 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
448 SharedRuntime::exception_handler_for_return_address),
449 R16_thread,
450 R4_ARG2);
451 // Copy handler's address.
452 __ mtctr(R3_RET);
453 __ pop_frame();
454 __ restore_LR_CR(R0);
455
456 // Set up the arguments for the exception handler:
457 // - R3_ARG1: exception oop
458 // - R4_ARG2: exception pc.
459
460 // Load pending exception oop.
461 __ ld(R3_ARG1,
462 in_bytes(Thread::pending_exception_offset()),
463 R16_thread);
464
465 // The exception pc is the return address in the caller.
466 // Must load it into R4_ARG2.
467 __ mflr(R4_ARG2);
468
469 #ifdef ASSERT
470 // Make sure exception is set.
471 {
472 Label L;
473 __ cmpdi(CCR0, R3_ARG1, 0);
474 __ bne(CCR0, L);
475 __ stop("StubRoutines::forward exception: no pending exception (2)");
476 __ bind(L);
477 }
478 #endif
479
480 // Clear the pending exception.
481 __ li(R0, 0);
482 __ std(R0,
483 in_bytes(Thread::pending_exception_offset()),
484 R16_thread);
485 // Jump to exception handler.
486 __ bctr();
487
488 return start;
489 }
490
491 #undef __
492 #define __ masm->
493 // Continuation point for throwing of implicit exceptions that are
494 // not handled in the current activation. Fabricates an exception
495 // oop and initiates normal exception dispatching in this
496 // frame. Only callee-saved registers are preserved (through the
497 // normal register window / RegisterMap handling). If the compiler
498 // needs all registers to be preserved between the fault point and
499 // the exception handler then it must assume responsibility for that
500 // in AbstractCompiler::continuation_for_implicit_null_exception or
501 // continuation_for_implicit_division_by_zero_exception. All other
502 // implicit exceptions (e.g., NullPointerException or
503 // AbstractMethodError on entry) are either at call sites or
504 // otherwise assume that stack unwinding will be initiated, so
505 // caller saved registers were assumed volatile in the compiler.
506 //
507 // Note that we generate only this stub into a RuntimeStub, because
508 // it needs to be properly traversed and ignored during GC, so we
509 // change the meaning of the "__" macro within this method.
510 //
511 // Note: the routine set_pc_not_at_call_for_caller in
512 // SharedRuntime.cpp requires that this code be generated into a
513 // RuntimeStub.
514 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
515 Register arg1 = noreg, Register arg2 = noreg) {
516 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
517 MacroAssembler* masm = new MacroAssembler(&code);
518
519 OopMapSet* oop_maps = new OopMapSet();
520 int frame_size_in_bytes = frame::abi_reg_args_size;
521 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
522
523 StubCodeMark mark(this, "StubRoutines", "throw_exception");
524
525 address start = __ pc();
526
527 __ save_LR_CR(R11_scratch1);
528
529 // Push a frame.
530 __ push_frame_reg_args(0, R11_scratch1);
531
532 address frame_complete_pc = __ pc();
533
534 if (restore_saved_exception_pc) {
535 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
536 }
537
538 // Note that we always have a runtime stub frame on the top of
539 // stack by this point. Remember the offset of the instruction
540 // whose address will be moved to R11_scratch1.
541 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
542
543 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
544
545 __ mr(R3_ARG1, R16_thread);
546 if (arg1 != noreg) {
547 __ mr(R4_ARG2, arg1);
548 }
549 if (arg2 != noreg) {
550 __ mr(R5_ARG3, arg2);
551 }
552 #if defined(ABI_ELFv2)
553 __ call_c(runtime_entry, relocInfo::none);
554 #else
555 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
556 #endif
557
558 // Set an oopmap for the call site.
559 oop_maps->add_gc_map((int)(gc_map_pc - start), map);
560
561 __ reset_last_Java_frame();
562
563 #ifdef ASSERT
564 // Make sure that this code is only executed if there is a pending
565 // exception.
566 {
567 Label L;
568 __ ld(R0,
569 in_bytes(Thread::pending_exception_offset()),
570 R16_thread);
571 __ cmpdi(CCR0, R0, 0);
572 __ bne(CCR0, L);
573 __ stop("StubRoutines::throw_exception: no pending exception");
574 __ bind(L);
575 }
576 #endif
577
578 // Pop frame.
579 __ pop_frame();
580
581 __ restore_LR_CR(R11_scratch1);
582
583 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
584 __ mtctr(R11_scratch1);
585 __ bctr();
586
587 // Create runtime stub with OopMap.
588 RuntimeStub* stub =
589 RuntimeStub::new_runtime_stub(name, &code,
590 /*frame_complete=*/ (int)(frame_complete_pc - start),
591 frame_size_in_bytes/wordSize,
592 oop_maps,
593 false);
594 return stub->entry_point();
595 }
596 #undef __
597 #define __ _masm->
598
599 // Generate G1 pre-write barrier for array.
600 //
601 // Input:
602 // from - register containing src address (only needed for spilling)
603 // to - register containing starting address
604 // count - register containing element count
605 // tmp - scratch register
606 //
607 // Kills:
608 // nothing
609 //
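// Conceptually (illustrative sketch; the generated code only makes the runtime
// call when SATB marking is active and the destination is not known to be
// uninitialized):
//
//   if (thread->satb_mark_queue().is_active()) {
//     BarrierSet::static_write_ref_array_pre(to, count);  // enqueue pre-values
//   }
//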
610 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
611 BarrierSet* const bs = Universe::heap()->barrier_set();
612 switch (bs->kind()) {
613 case BarrierSet::G1SATBCT:
614 case BarrierSet::G1SATBCTLogging:
615 // With G1, don't generate the call if we statically know that the target is uninitialized
616 if (!dest_uninitialized) {
617 const int spill_slots = 4 * wordSize;
618 const int frame_size = frame::abi_reg_args_size + spill_slots;
619 Label filtered;
620
621 // Is marking active?
622 if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
623 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
624 } else {
625 guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
626 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
627 }
628 __ cmpdi(CCR0, Rtmp1, 0);
629 __ beq(CCR0, filtered);
630
631 __ save_LR_CR(R0);
632 __ push_frame_reg_args(spill_slots, R0);
633 __ std(from, frame_size - 1 * wordSize, R1_SP);
634 __ std(to, frame_size - 2 * wordSize, R1_SP);
635 __ std(count, frame_size - 3 * wordSize, R1_SP);
636
637 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
638
639 __ ld(from, frame_size - 1 * wordSize, R1_SP);
640 __ ld(to, frame_size - 2 * wordSize, R1_SP);
641 __ ld(count, frame_size - 3 * wordSize, R1_SP);
642 __ pop_frame();
643 __ restore_LR_CR(R0);
644
645 __ bind(filtered);
646 }
647 break;
648 case BarrierSet::CardTableModRef:
649 case BarrierSet::CardTableExtension:
650 case BarrierSet::ModRef:
651 break;
652 default:
653 ShouldNotReachHere();
654 }
655 }
656
657 // Generate CMS/G1 post-write barrier for array.
658 //
659 // Input:
660 // addr - register containing starting address
661 // count - register containing element count
662 // tmp - scratch register
663 //
664 // The input registers and R0 are overwritten.
665 //
666 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
667 BarrierSet* const bs = Universe::heap()->barrier_set();
668
669 switch (bs->kind()) {
670 case BarrierSet::G1SATBCT:
671 case BarrierSet::G1SATBCTLogging:
672 {
673 if (branchToEnd) {
674 __ save_LR_CR(R0);
675 // We need this frame only to spill LR.
676 __ push_frame_reg_args(0, R0);
677 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
678 __ pop_frame();
679 __ restore_LR_CR(R0);
680 } else {
681 // Tail call: fake call from stub caller by branching without linking.
682 address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
683 __ mr_if_needed(R3_ARG1, addr);
684 __ mr_if_needed(R4_ARG2, count);
685 __ load_const(R11, entry_point, R0);
686 __ call_c_and_return_to_caller(R11);
687 }
688 }
689 break;
690 case BarrierSet::CardTableModRef:
691 case BarrierSet::CardTableExtension:
692 {
693 Label Lskip_loop, Lstore_loop;
694 if (UseConcMarkSweepGC) {
695 // TODO PPC port: contribute optimization / requires shared changes
696 __ release();
697 }
698
699 CardTableModRefBS* const ct = (CardTableModRefBS*)bs;
700 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
701 assert_different_registers(addr, count, tmp);
702
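// Roughly equivalent C++ for the card-dirtying code below (illustrative sketch):
//
//   uintptr_t first = (uintptr_t)addr;
//   uintptr_t last  = first + (count - 1) * BytesPerHeapOop;   // address of last oop
//   for (uintptr_t c = first >> CardTableModRefBS::card_shift;
//        c <= (last >> CardTableModRefBS::card_shift); c++) {
//     ct->byte_map_base[c] = 0;   // 0 is the dirty-card value
//   }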
703 __ sldi(count, count, LogBytesPerHeapOop);
704 __ addi(count, count, -BytesPerHeapOop);
705 __ add(count, addr, count);
706 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
707 __ srdi(addr, addr, CardTableModRefBS::card_shift);
708 __ srdi(count, count, CardTableModRefBS::card_shift);
709 __ subf(count, addr, count);
710 assert_different_registers(R0, addr, count, tmp);
711 __ load_const(tmp, (address)ct->byte_map_base);
712 __ addic_(count, count, 1);
713 __ beq(CCR0, Lskip_loop);
714 __ li(R0, 0);
715 __ mtctr(count);
716 // Byte store loop
717 __ bind(Lstore_loop);
718 __ stbx(R0, tmp, addr);
719 __ addi(addr, addr, 1);
720 __ bdnz(Lstore_loop);
721 __ bind(Lskip_loop);
722
723 if (!branchToEnd) __ blr();
724 }
725 break;
726 case BarrierSet::ModRef:
727 if (!branchToEnd) __ blr();
728 break;
729 default:
730 ShouldNotReachHere();
731 }
732 }
733
734 // Support for void zero_words_aligned8(HeapWord* to, size_t count)
735 //
736 // Arguments:
737 // to:      R3_ARG1, start address, must be 8-byte aligned
738 // count:   R4_ARG2, number of 8-byte words to clear
739 //
740 // Destroys: R3_ARG1-R7_ARG5, CTR, CCR0, CCR1
741 //
742 address generate_zero_words_aligned8() {
743 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
744
745 // Implemented as in ClearArray.
746 address start = __ function_entry();
747
748 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned)
749 Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
750 Register tmp1_reg = R5_ARG3;
751 Register tmp2_reg = R6_ARG4;
752 Register zero_reg = R7_ARG5;
753
754 // Procedure for large arrays (uses data cache block zero instruction).
755 Label dwloop, fast, fastloop, restloop, lastdword, done;
756 int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
757 int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
758
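// Worked example assuming a 128-byte cache line: cl_size = 128, cl_dwords = 16,
// cl_dwordaddr_bits = 4. The rldicl_ below extracts (16 - (base >> 3)) % 16,
// i.e. the number of 8-byte stores needed to reach the next 128-byte boundary;
// dcbz is only used when at least min_dcbz (= 2) whole cache lines remain.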
759 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
760 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
761 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
762 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
763 __ load_const_optimized(zero_reg, 0L); // Use as zero register.
764
765 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
766 __ beq(CCR0, lastdword); // size <= 1
767 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
768 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
769 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
770
771 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
772 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
773
774 __ beq(CCR0, fast); // already 128byte aligned
775 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
776 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
777
778 // Clear in first cache line dword-by-dword if not already 128byte aligned.
779 __ bind(dwloop);
780 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
781 __ addi(base_ptr_reg, base_ptr_reg, 8);
782 __ bdnz(dwloop);
783
784 // clear 128byte blocks
785 __ bind(fast);
786 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
787 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
788
789 __ mtctr(tmp1_reg); // load counter
790 __ cmpdi(CCR1, tmp2_reg, 0); // rest even?
791 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
792
793 __ bind(fastloop);
794 __ dcbz(base_ptr_reg); // Clear 128byte aligned block.
795 __ addi(base_ptr_reg, base_ptr_reg, cl_size);
796 __ bdnz(fastloop);
797
798 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line.
799 __ beq(CCR0, lastdword); // rest<=1
800 __ mtctr(tmp1_reg); // load counter
801
802 // Clear rest.
803 __ bind(restloop);
804 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
805 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block.
806 __ addi(base_ptr_reg, base_ptr_reg, 16);
807 __ bdnz(restloop);
808
809 __ bind(lastdword);
810 __ beq(CCR1, done);
811 __ std(zero_reg, 0, base_ptr_reg);
812 __ bind(done);
813 __ blr(); // return
814
815 return start;
816 }
817
818 // The following routine generates a subroutine to throw an asynchronous
819 // UnknownError when an unsafe access gets a fault that could not be
820 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
821 //
822 address generate_handler_for_unsafe_access() {
823 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
824 address start = __ function_entry();
825 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
826 return start;
827 }
828
829 #if !defined(PRODUCT)
830 // Wrapper which calls oopDesc::is_oop_or_null()
831 // Only called by MacroAssembler::verify_oop
832 static void verify_oop_helper(const char* message, oop o) {
833 if (!o->is_oop_or_null()) {
834 fatal(message);
835 }
836 ++ StubRoutines::_verify_oop_count;
837 }
838 #endif
839
840 // Return address of code to be called from code generated by
841 // MacroAssembler::verify_oop.
842 //
843 // Don't generate, rather use C++ code.
844 address generate_verify_oop() {
845 StubCodeMark mark(this, "StubRoutines", "verify_oop");
846
847 // this is actually a `FunctionDescriptor*'.
848 address start = 0;
849
850 #if !defined(PRODUCT)
851 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
852 #endif
853
854 return start;
855 }
856
857 // Fairer handling of safepoints for native methods.
858 //
859 // Generate code which reads from the polling page. This special handling is needed as the
860 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
861 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
862 // to read from the safepoint polling page.
863 address generate_load_from_poll() {
864 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
865 address start = __ function_entry();
866 __ unimplemented("StubRoutines::verify_oop", 95); // TODO PPC port
867 return start;
868 }
869
870 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
871 //
872 // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
873 // tracing (-XX:+TraceOptimizeFill) shows that the intrinsic replacement doesn't happen at all!
874 //
875 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
876 // for turning on loop predication optimization, and hence the behavior of "array range check"
877 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
878 //
879 // Generate stub for disjoint short fill. If "aligned" is true, the
880 // "to" address is assumed to be heapword aligned.
881 //
882 // Arguments for generated stub:
883 // to: R3_ARG1
884 // value: R4_ARG2
885 // count: R5_ARG3 treated as signed
886 //
887 address generate_fill(BasicType t, bool aligned, const char* name) {
888 StubCodeMark mark(this, "StubRoutines", name);
889 address start = __ function_entry();
890
891 const Register to = R3_ARG1; // destination array address
892 const Register value = R4_ARG2; // fill value
893 const Register count = R5_ARG3; // elements count
894 const Register temp = R6_ARG4; // temp register
895
896 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
897
898 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
899 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
900
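// The fill value is replicated to 64 bits with rldimi so that one std covers
// several elements. Worked example for T_BYTE: value = 0x41 -> 0x4141 ->
// 0x41414141 in the switch below, and -> 0x4141414141414141 just before the
// 32-byte loop, i.e. 8 byte-elements per 8-byte store.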
901 int shift = -1;
902 switch (t) {
903 case T_BYTE:
904 shift = 2;
905 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
906 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
907 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
908 __ blt(CCR0, L_fill_elements);
909 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
910 break;
911 case T_SHORT:
912 shift = 1;
913 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
914 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
915 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
916 __ blt(CCR0, L_fill_elements);
917 break;
918 case T_INT:
919 shift = 0;
920 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
921 __ blt(CCR0, L_fill_4_bytes);
922 break;
923 default: ShouldNotReachHere();
924 }
925
926 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
927 // Align source address at 4 bytes address boundary.
928 if (t == T_BYTE) {
929 // One byte misalignment happens only for byte arrays.
930 __ andi_(temp, to, 1);
931 __ beq(CCR0, L_skip_align1);
932 __ stb(value, 0, to);
933 __ addi(to, to, 1);
934 __ addi(count, count, -1);
935 __ bind(L_skip_align1);
936 }
937 // Two bytes misalignment happens only for byte and short (char) arrays.
938 __ andi_(temp, to, 2);
939 __ beq(CCR0, L_skip_align2);
940 __ sth(value, 0, to);
941 __ addi(to, to, 2);
942 __ addi(count, count, -(1 << (shift - 1)));
943 __ bind(L_skip_align2);
944 }
945
946 if (!aligned) {
947 // Align to 8 bytes, we know we are 4 byte aligned to start.
948 __ andi_(temp, to, 7);
949 __ beq(CCR0, L_fill_32_bytes);
950 __ stw(value, 0, to);
951 __ addi(to, to, 4);
952 __ addi(count, count, -(1 << shift));
953 __ bind(L_fill_32_bytes);
954 }
955
956 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
957 // Clone bytes int->long as above.
958 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
959
960 Label L_check_fill_8_bytes;
961 // Fill 32-byte chunks.
962 __ subf_(count, temp, count);
963 __ blt(CCR0, L_check_fill_8_bytes);
964
965 Label L_fill_32_bytes_loop;
966 __ align(32);
967 __ bind(L_fill_32_bytes_loop);
968
969 __ std(value, 0, to);
970 __ std(value, 8, to);
971 __ subf_(count, temp, count); // Update count.
972 __ std(value, 16, to);
973 __ std(value, 24, to);
974
975 __ addi(to, to, 32);
976 __ bge(CCR0, L_fill_32_bytes_loop);
977
978 __ bind(L_check_fill_8_bytes);
979 __ add_(count, temp, count);
980 __ beq(CCR0, L_exit);
981 __ addic_(count, count, -(2 << shift));
982 __ blt(CCR0, L_fill_4_bytes);
983
984 //
985 // Length is too short, just fill 8 bytes at a time.
986 //
987 Label L_fill_8_bytes_loop;
988 __ bind(L_fill_8_bytes_loop);
989 __ std(value, 0, to);
990 __ addic_(count, count, -(2 << shift));
991 __ addi(to, to, 8);
992 __ bge(CCR0, L_fill_8_bytes_loop);
993
994 // Fill trailing 4 bytes.
995 __ bind(L_fill_4_bytes);
996 __ andi_(temp, count, 1<<shift);
997 __ beq(CCR0, L_fill_2_bytes);
998
999 __ stw(value, 0, to);
1000 if (t == T_BYTE || t == T_SHORT) {
1001 __ addi(to, to, 4);
1002 // Fill trailing 2 bytes.
1003 __ bind(L_fill_2_bytes);
1004 __ andi_(temp, count, 1<<(shift-1));
1005 __ beq(CCR0, L_fill_byte);
1006 __ sth(value, 0, to);
1007 if (t == T_BYTE) {
1008 __ addi(to, to, 2);
1009 // Fill trailing byte.
1010 __ bind(L_fill_byte);
1011 __ andi_(count, count, 1);
1012 __ beq(CCR0, L_exit);
1013 __ stb(value, 0, to);
1014 } else {
1015 __ bind(L_fill_byte);
1016 }
1017 } else {
1018 __ bind(L_fill_2_bytes);
1019 }
1020 __ bind(L_exit);
1021 __ blr();
1022
1023 // Handle copies less than 8 bytes. Int is handled elsewhere.
1024 if (t == T_BYTE) {
1025 __ bind(L_fill_elements);
1026 Label L_fill_2, L_fill_4;
1027 __ andi_(temp, count, 1);
1028 __ beq(CCR0, L_fill_2);
1029 __ stb(value, 0, to);
1030 __ addi(to, to, 1);
1031 __ bind(L_fill_2);
1032 __ andi_(temp, count, 2);
1033 __ beq(CCR0, L_fill_4);
1034 __ stb(value, 0, to);
1035 __ stb(value, 1, to);
1036 __ addi(to, to, 2);
1037 __ bind(L_fill_4);
1038 __ andi_(temp, count, 4);
1039 __ beq(CCR0, L_exit);
1040 __ stb(value, 0, to);
1041 __ stb(value, 1, to);
1042 __ stb(value, 2, to);
1043 __ stb(value, 3, to);
1044 __ blr();
1045 }
1046
1047 if (t == T_SHORT) {
1048 Label L_fill_2;
1049 __ bind(L_fill_elements);
1050 __ andi_(temp, count, 1);
1051 __ beq(CCR0, L_fill_2);
1052 __ sth(value, 0, to);
1053 __ addi(to, to, 2);
1054 __ bind(L_fill_2);
1055 __ andi_(temp, count, 2);
1056 __ beq(CCR0, L_exit);
1057 __ sth(value, 0, to);
1058 __ sth(value, 2, to);
1059 __ blr();
1060 }
1061 return start;
1062 }
1063
1064
1065 // Generate overlap test for array copy stubs.
1066 //
1067 // Input:
1068 // R3_ARG1 - from
1069 // R4_ARG2 - to
1070 // R5_ARG3 - element count
1071 //
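// The test branches to the backward-copy code iff a forward copy would corrupt
// the source, i.e. (illustrative sketch):
//
//   if (from < to && (uintptr_t)(to - from) < ((uintptr_t)count << log2_elem_size)) {
//     // overlap: fall through to l_overlap, caller copies backwards
//   } else {
//     // no overlap: branch to no_overlap_target, copy forwards
//   }
//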
1072 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1073 Register tmp1 = R6_ARG4;
1074 Register tmp2 = R7_ARG5;
1075
1076 Label l_overlap;
1077 #ifdef ASSERT
1078 __ srdi_(tmp2, R5_ARG3, 31);
1079 __ asm_assert_eq("missing zero extend", 0xAFFE);
1080 #endif
1081
1082 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
1083 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
1084 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
1085 __ cmpld(CCR1, tmp1, tmp2);
1086 __ crand(/*CCR0 lt*/0, /*CCR1 lt*/4+0, /*CCR0 lt*/0);
1087 __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
1088
1089 // need to copy forwards
1090 if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
1091 __ b(no_overlap_target);
1092 } else {
1093 __ load_const(tmp1, no_overlap_target, tmp2);
1094 __ mtctr(tmp1);
1095 __ bctr();
1096 }
1097
1098 __ bind(l_overlap);
1099 // need to copy backwards
1100 }
1101
1102 // The guideline in the implementations of generate_disjoint_xxx_copy
1103 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1104 // single instructions, but to avoid alignment interrupts (see subsequent
1105 // comment). Furthermore, we try to minimize misaligned access, even
1106 // though they cause no alignment interrupt.
1107 //
1108 // In Big-Endian mode, the PowerPC architecture requires implementations to
1109 // handle automatically misaligned integer halfword and word accesses,
1110 // word-aligned integer doubleword accesses, and word-aligned floating-point
1111 // accesses. Other accesses may or may not generate an Alignment interrupt
1112 // depending on the implementation.
1113 // Alignment interrupt handling may require on the order of hundreds of cycles,
1114 // so every effort should be made to avoid misaligned memory values.
1115 //
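// The stubs below therefore check relative alignment with an XOR trick
// (illustrative sketch): ((from ^ to) & 3) == 0 means "from" and "to" have the
// same alignment modulo 4, so aligning one of them aligns both; the same check
// with mask 7 is used for 8-byte alignment.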
1116 //
1117 // Generate stub for disjoint byte copy. If "aligned" is true, the
1118 // "from" and "to" addresses are assumed to be heapword aligned.
1119 //
1120 // Arguments for generated stub:
1121 // from: R3_ARG1
1122 // to: R4_ARG2
1123 // count: R5_ARG3 treated as signed
1124 //
1125 address generate_disjoint_byte_copy(bool aligned, const char * name) {
1126 StubCodeMark mark(this, "StubRoutines", name);
1127 address start = __ function_entry();
1128
1129 Register tmp1 = R6_ARG4;
1130 Register tmp2 = R7_ARG5;
1131 Register tmp3 = R8_ARG6;
1132 Register tmp4 = R9_ARG7;
1133
1134 VectorSRegister tmp_vsr1 = VSR1;
1135 VectorSRegister tmp_vsr2 = VSR2;
1136
1137 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1138
1139 // Don't try anything fancy if arrays don't have many elements.
1140 __ li(tmp3, 0);
1141 __ cmpwi(CCR0, R5_ARG3, 17);
1142 __ ble(CCR0, l_6); // copy 4 at a time
1143
1144 if (!aligned) {
1145 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1146 __ andi_(tmp1, tmp1, 3);
1147 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1148
1149 // Copy elements if necessary to align to 4 bytes.
1150 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1151 __ andi_(tmp1, tmp1, 3);
1152 __ beq(CCR0, l_2);
1153
1154 __ subf(R5_ARG3, tmp1, R5_ARG3);
1155 __ bind(l_9);
1156 __ lbz(tmp2, 0, R3_ARG1);
1157 __ addic_(tmp1, tmp1, -1);
1158 __ stb(tmp2, 0, R4_ARG2);
1159 __ addi(R3_ARG1, R3_ARG1, 1);
1160 __ addi(R4_ARG2, R4_ARG2, 1);
1161 __ bne(CCR0, l_9);
1162
1163 __ bind(l_2);
1164 }
1165
1166 // copy 8 elements at a time
1167 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1168 __ andi_(tmp1, tmp2, 7);
1169 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1170
1171 // copy a 2-element word if necessary to align to 8 bytes
1172 __ andi_(R0, R3_ARG1, 7);
1173 __ beq(CCR0, l_7);
1174
1175 __ lwzx(tmp2, R3_ARG1, tmp3);
1176 __ addi(R5_ARG3, R5_ARG3, -4);
1177 __ stwx(tmp2, R4_ARG2, tmp3);
1178 { // FasterArrayCopy
1179 __ addi(R3_ARG1, R3_ARG1, 4);
1180 __ addi(R4_ARG2, R4_ARG2, 4);
1181 }
1182 __ bind(l_7);
1183
1184 { // FasterArrayCopy
1185 __ cmpwi(CCR0, R5_ARG3, 31);
1186 __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1187
1188 __ srdi(tmp1, R5_ARG3, 5);
1189 __ andi_(R5_ARG3, R5_ARG3, 31);
1190 __ mtctr(tmp1);
1191
1192 if (!VM_Version::has_vsx()) {
1193
1194 __ bind(l_8);
1195 // Use unrolled version for mass copying (copy 32 elements a time)
1196 // Load feeding store gets zero latency on Power6, however not on Power5.
1197 // Therefore, the following sequence is made for the good of both.
1198 __ ld(tmp1, 0, R3_ARG1);
1199 __ ld(tmp2, 8, R3_ARG1);
1200 __ ld(tmp3, 16, R3_ARG1);
1201 __ ld(tmp4, 24, R3_ARG1);
1202 __ std(tmp1, 0, R4_ARG2);
1203 __ std(tmp2, 8, R4_ARG2);
1204 __ std(tmp3, 16, R4_ARG2);
1205 __ std(tmp4, 24, R4_ARG2);
1206 __ addi(R3_ARG1, R3_ARG1, 32);
1207 __ addi(R4_ARG2, R4_ARG2, 32);
1208 __ bdnz(l_8);
1209
1210 } else { // Processor supports VSX, so use it to mass copy.
1211
1212 // Prefetch the data into the L2 cache.
1213 __ dcbt(R3_ARG1, 0);
1214
1215 __ li(tmp1, 16);
1216
1217 // Backbranch target aligned to 32-byte. Not 16-byte align as
1218 // loop contains < 8 instructions that fit inside a single
1219 // i-cache sector.
1220 __ align(32);
1221
1222 __ bind(l_10);
1223 // Use loop with VSX load/store instructions to
1224 // copy 32 elements a time.
1225 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1226 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1227 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1228 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1229 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1230 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1231 __ bdnz(l_10); // Dec CTR and loop if not zero.
1232
1233 } // VSX
1234 } // FasterArrayCopy
1235
1236 __ bind(l_6);
1237
1238 // copy 4 elements at a time
1239 __ cmpwi(CCR0, R5_ARG3, 4);
1240 __ blt(CCR0, l_1);
1241 __ srdi(tmp1, R5_ARG3, 2);
1242 __ mtctr(tmp1); // is > 0
1243 __ andi_(R5_ARG3, R5_ARG3, 3);
1244
1245 { // FasterArrayCopy
1246 __ addi(R3_ARG1, R3_ARG1, -4);
1247 __ addi(R4_ARG2, R4_ARG2, -4);
1248 __ bind(l_3);
1249 __ lwzu(tmp2, 4, R3_ARG1);
1250 __ stwu(tmp2, 4, R4_ARG2);
1251 __ bdnz(l_3);
1252 __ addi(R3_ARG1, R3_ARG1, 4);
1253 __ addi(R4_ARG2, R4_ARG2, 4);
1254 }
1255
1256 // do single element copy
1257 __ bind(l_1);
1258 __ cmpwi(CCR0, R5_ARG3, 0);
1259 __ beq(CCR0, l_4);
1260
1261 { // FasterArrayCopy
1262 __ mtctr(R5_ARG3);
1263 __ addi(R3_ARG1, R3_ARG1, -1);
1264 __ addi(R4_ARG2, R4_ARG2, -1);
1265
1266 __ bind(l_5);
1267 __ lbzu(tmp2, 1, R3_ARG1);
1268 __ stbu(tmp2, 1, R4_ARG2);
1269 __ bdnz(l_5);
1270 }
1271
1272 __ bind(l_4);
1273 __ blr();
1274
1275 return start;
1276 }
1277
1278 // Generate stub for conjoint byte copy. If "aligned" is true, the
1279 // "from" and "to" addresses are assumed to be heapword aligned.
1280 //
1281 // Arguments for generated stub:
1282 // from: R3_ARG1
1283 // to: R4_ARG2
1284 // count: R5_ARG3 treated as signed
1285 //
1286 address generate_conjoint_byte_copy(bool aligned, const char * name) {
1287 StubCodeMark mark(this, "StubRoutines", name);
1288 address start = __ function_entry();
1289
1290 Register tmp1 = R6_ARG4;
1291 Register tmp2 = R7_ARG5;
1292 Register tmp3 = R8_ARG6;
1293
1294 #if defined(ABI_ELFv2)
1295 address nooverlap_target = aligned ?
1296 StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1297 StubRoutines::jbyte_disjoint_arraycopy();
1298 #else
1299 address nooverlap_target = aligned ?
1300 ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
1301 ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
1302 #endif
1303
1304 array_overlap_test(nooverlap_target, 0);
1305 // Do reverse copy. We assume the case of actual overlap is rare enough
1306 // that we don't have to optimize it.
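// Roughly equivalent C++ for the loop below (illustrative sketch):
//
//   for (int i = (int)count - 1; i >= 0; i--) {
//     to[i] = from[i];   // byte-wise copy, highest index first
//   }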
1307 Label l_1, l_2;
1308
1309 __ b(l_2);
1310 __ bind(l_1);
1311 __ stbx(tmp1, R4_ARG2, R5_ARG3);
1312 __ bind(l_2);
1313 __ addic_(R5_ARG3, R5_ARG3, -1);
1314 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1315 __ bge(CCR0, l_1);
1316
1317 __ blr();
1318
1319 return start;
1320 }
1321
1322 // Generate stub for disjoint short copy. If "aligned" is true, the
1323 // "from" and "to" addresses are assumed to be heapword aligned.
1324 //
1325 // Arguments for generated stub:
1326 // from: R3_ARG1
1327 // to: R4_ARG2
1328 // elm.count: R5_ARG3 treated as signed
1329 //
1330 // Strategy for aligned==true:
1331 //
1332 // If length <= 9:
1333 // 1. copy 2 elements at a time (l_6)
1334 // 2. copy last element if original element count was odd (l_1)
1335 //
1336 // If length > 9:
1337 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
1338 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
1339 // 3. copy last element if one was left in step 2. (l_1)
1340 //
1341 //
1342 // Strategy for aligned==false:
1343 //
1344 // If length <= 9: same as aligned==true case, but NOTE: load/stores
1345 // can be unaligned (see comment below)
1346 //
1347 // If length > 9:
1348 // 1. continue with step 6. if the alignment of from and to mod 4
1349 // is different.
1350 // 2. align from and to to 4 bytes by copying 1 element if necessary
1351 // 3. at l_2 from and to are 4 byte aligned; continue with
1352 // 5. if they cannot be aligned to 8 bytes because they have
1353 // got different alignment mod 8.
1354 // 4. at this point we know that both, from and to, have the same
1355 // alignment mod 8, now copy one element if necessary to get
1356 // 8 byte alignment of from and to.
1357 // 5. copy 4 elements at a time until less than 4 elements are
1358 // left; depending on step 3. all load/stores are aligned or
1359 // either all loads or all stores are unaligned.
1360 // 6. copy 2 elements at a time until less than 2 elements are
1361 // left (l_6); arriving here from step 1., there is a chance
1362 // that all accesses are unaligned.
1363 // 7. copy last element if one was left in step 6. (l_1)
1364 //
1365 // There are unaligned data accesses using integer load/store
1366 // instructions in this stub. POWER allows such accesses.
1367 //
1368 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1369 // Chapter 2: Effect of Operand Placement on Performance) unaligned
1370 // integer load/stores have good performance. Only unaligned
1371 // floating point load/stores can have poor performance.
1372 //
1373 // TODO:
1374 //
1375 // 1. check if aligning the backbranch target of loops is beneficial
1376 //
1377 address generate_disjoint_short_copy(bool aligned, const char * name) {
1378 StubCodeMark mark(this, "StubRoutines", name);
1379
1380 Register tmp1 = R6_ARG4;
1381 Register tmp2 = R7_ARG5;
1382 Register tmp3 = R8_ARG6;
1383 Register tmp4 = R9_ARG7;
1384
1385 VectorSRegister tmp_vsr1 = VSR1;
1386 VectorSRegister tmp_vsr2 = VSR2;
1387
1388 address start = __ function_entry();
1389
1390 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1391
1392 // don't try anything fancy if arrays don't have many elements
1393 __ li(tmp3, 0);
1394 __ cmpwi(CCR0, R5_ARG3, 9);
1395 __ ble(CCR0, l_6); // copy 2 at a time
1396
1397 if (!aligned) {
1398 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1399 __ andi_(tmp1, tmp1, 3);
1400 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1401
1402 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1403
1404 // Copy 1 element if necessary to align to 4 bytes.
1405 __ andi_(tmp1, R3_ARG1, 3);
1406 __ beq(CCR0, l_2);
1407
1408 __ lhz(tmp2, 0, R3_ARG1);
1409 __ addi(R3_ARG1, R3_ARG1, 2);
1410 __ sth(tmp2, 0, R4_ARG2);
1411 __ addi(R4_ARG2, R4_ARG2, 2);
1412 __ addi(R5_ARG3, R5_ARG3, -1);
1413 __ bind(l_2);
1414
1415 // At this point the positions of both, from and to, are at least 4 byte aligned.
1416
1417 // Copy 4 elements at a time.
1418 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1419 __ xorr(tmp2, R3_ARG1, R4_ARG2);
1420 __ andi_(tmp1, tmp2, 7);
1421 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1422
1423 // Copy a 2-element word if necessary to align to 8 bytes.
1424 __ andi_(R0, R3_ARG1, 7);
1425 __ beq(CCR0, l_7);
1426
1427 __ lwzx(tmp2, R3_ARG1, tmp3);
1428 __ addi(R5_ARG3, R5_ARG3, -2);
1429 __ stwx(tmp2, R4_ARG2, tmp3);
1430 { // FasterArrayCopy
1431 __ addi(R3_ARG1, R3_ARG1, 4);
1432 __ addi(R4_ARG2, R4_ARG2, 4);
1433 }
1434 }
1435
1436 __ bind(l_7);
1437
1438 // Copy 4 elements at a time; either the loads or the stores can
1439 // be unaligned if aligned == false.
1440
1441 { // FasterArrayCopy
1442 __ cmpwi(CCR0, R5_ARG3, 15);
1443 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1444
1445 __ srdi(tmp1, R5_ARG3, 4);
1446 __ andi_(R5_ARG3, R5_ARG3, 15);
1447 __ mtctr(tmp1);
1448
1449 if (!VM_Version::has_vsx()) {
1450
1451 __ bind(l_8);
1452 // Use unrolled version for mass copying (copy 16 elements a time).
1453 // Load feeding store gets zero latency on Power6, however not on Power5.
1454 // Therefore, the following sequence is made for the good of both.
1455 __ ld(tmp1, 0, R3_ARG1);
1456 __ ld(tmp2, 8, R3_ARG1);
1457 __ ld(tmp3, 16, R3_ARG1);
1458 __ ld(tmp4, 24, R3_ARG1);
1459 __ std(tmp1, 0, R4_ARG2);
1460 __ std(tmp2, 8, R4_ARG2);
1461 __ std(tmp3, 16, R4_ARG2);
1462 __ std(tmp4, 24, R4_ARG2);
1463 __ addi(R3_ARG1, R3_ARG1, 32);
1464 __ addi(R4_ARG2, R4_ARG2, 32);
1465 __ bdnz(l_8);
1466
1467 } else { // Processor supports VSX, so use it to mass copy.
1468
1469 // Prefetch src data into L2 cache.
1470 __ dcbt(R3_ARG1, 0);
1471
1472 __ li(tmp1, 16);
1473
1474 // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1475 // as loop contains < 8 instructions that fit inside a single
1476 // i-cache sector.
1477 __ align(32);
1478
1479 __ bind(l_9);
1480 // Use loop with VSX load/store instructions to
1481 // copy 16 elements a time.
1482 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
1483 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1484 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
1485 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1486 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
1487 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
1488 __ bdnz(l_9); // Dec CTR and loop if not zero.
1489
1490 }
1491 } // FasterArrayCopy
1492 __ bind(l_6);
1493
1494 // copy 2 elements at a time
1495 { // FasterArrayCopy
1496 __ cmpwi(CCR0, R5_ARG3, 2);
1497 __ blt(CCR0, l_1);
1498 __ srdi(tmp1, R5_ARG3, 1);
1499 __ andi_(R5_ARG3, R5_ARG3, 1);
1500
1501 __ addi(R3_ARG1, R3_ARG1, -4);
1502 __ addi(R4_ARG2, R4_ARG2, -4);
1503 __ mtctr(tmp1);
1504
1505 __ bind(l_3);
1506 __ lwzu(tmp2, 4, R3_ARG1);
1507 __ stwu(tmp2, 4, R4_ARG2);
1508 __ bdnz(l_3);
1509
1510 __ addi(R3_ARG1, R3_ARG1, 4);
1511 __ addi(R4_ARG2, R4_ARG2, 4);
1512 }
1513
1514 // do single element copy
1515 __ bind(l_1);
1516 __ cmpwi(CCR0, R5_ARG3, 0);
1517 __ beq(CCR0, l_4);
1518
1519 { // FasterArrayCopy
1520 __ mtctr(R5_ARG3);
1521 __ addi(R3_ARG1, R3_ARG1, -2);
1522 __ addi(R4_ARG2, R4_ARG2, -2);
1523
1524 __ bind(l_5);
1525 __ lhzu(tmp2, 2, R3_ARG1);
1526 __ sthu(tmp2, 2, R4_ARG2);
1527 __ bdnz(l_5);
1528 }
1529 __ bind(l_4);
1530 __ blr();
1531
1532 return start;
1533 }
1534
1535 // Generate stub for conjoint short copy. If "aligned" is true, the
1536 // "from" and "to" addresses are assumed to be heapword aligned.
1537 //
1538 // Arguments for generated stub:
1539 // from: R3_ARG1
1540 // to: R4_ARG2
1541 // count: R5_ARG3 treated as signed
1542 //
1543 address generate_conjoint_short_copy(bool aligned, const char * name) {
1544 StubCodeMark mark(this, "StubRoutines", name);
1545 address start = __ function_entry();
1546
1547 Register tmp1 = R6_ARG4;
1548 Register tmp2 = R7_ARG5;
1549 Register tmp3 = R8_ARG6;
1550
1551 #if defined(ABI_ELFv2)
1552 address nooverlap_target = aligned ?
1553 StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1554 StubRoutines::jshort_disjoint_arraycopy();
1555 #else
1556 address nooverlap_target = aligned ?
1557 ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
1558 ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
1559 #endif
1560
1561 array_overlap_test(nooverlap_target, 1);
1562
1563 Label l_1, l_2;
1564 __ sldi(tmp1, R5_ARG3, 1);
1565 __ b(l_2);
1566 __ bind(l_1);
1567 __ sthx(tmp2, R4_ARG2, tmp1);
1568 __ bind(l_2);
1569 __ addic_(tmp1, tmp1, -2);
1570 __ lhzx(tmp2, R3_ARG1, tmp1);
1571 __ bge(CCR0, l_1);
1572
1573 __ blr();
1574
1575 return start;
1576 }
1577
1578 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
1579 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1580 //
1581 // Arguments:
1582 // from: R3_ARG1
1583 // to: R4_ARG2
1584 // count: R5_ARG3 treated as signed
1585 //
1586 void generate_disjoint_int_copy_core(bool aligned) {
1587 Register tmp1 = R6_ARG4;
1588 Register tmp2 = R7_ARG5;
1589 Register tmp3 = R8_ARG6;
1590 Register tmp4 = R0;
1591
1592 VectorSRegister tmp_vsr1 = VSR1;
1593 VectorSRegister tmp_vsr2 = VSR2;
1594
1595 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1596
1597 // for short arrays, just do single element copy
1598 __ li(tmp3, 0);
1599 __ cmpwi(CCR0, R5_ARG3, 5);
1600 __ ble(CCR0, l_2);
1601
1602 if (!aligned) {
1603 // check if arrays have same alignment mod 8.
1604 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1605 __ andi_(R0, tmp1, 7);
1606 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1607 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1608
1609 // copy 1 element to align to and from on an 8 byte boundary
1610 __ andi_(R0, R3_ARG1, 7);
1611 __ beq(CCR0, l_4);
1612
1613 __ lwzx(tmp2, R3_ARG1, tmp3);
1614 __ addi(R5_ARG3, R5_ARG3, -1);
1615 __ stwx(tmp2, R4_ARG2, tmp3);
1616 { // FasterArrayCopy
1617 __ addi(R3_ARG1, R3_ARG1, 4);
1618 __ addi(R4_ARG2, R4_ARG2, 4);
1619 }
1620 __ bind(l_4);
1621 }
1622
1623 { // FasterArrayCopy
1624 __ cmpwi(CCR0, R5_ARG3, 7);
1625 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1626
1627 __ srdi(tmp1, R5_ARG3, 3);
1628 __ andi_(R5_ARG3, R5_ARG3, 7);
1629 __ mtctr(tmp1);
1630
1631 if (!VM_Version::has_vsx()) {
1632
1633 __ bind(l_6);
1634 // Use unrolled version for mass copying (copy 8 elements a time).
1635 // Load feeding store gets zero latency on power6, however not on power 5.
1636 // Therefore, the following sequence is made for the good of both.
1637 __ ld(tmp1, 0, R3_ARG1);
1638 __ ld(tmp2, 8, R3_ARG1);
1639 __ ld(tmp3, 16, R3_ARG1);
1640 __ ld(tmp4, 24, R3_ARG1);
1641 __ std(tmp1, 0, R4_ARG2);
1642 __ std(tmp2, 8, R4_ARG2);
1643 __ std(tmp3, 16, R4_ARG2);
1644 __ std(tmp4, 24, R4_ARG2);
1645 __ addi(R3_ARG1, R3_ARG1, 32);
1646 __ addi(R4_ARG2, R4_ARG2, 32);
1647 __ bdnz(l_6);
1648
1649 } else { // Processor supports VSX, so use it to mass copy.
1650
1651 // Prefetch the data into the L2 cache.
1652 __ dcbt(R3_ARG1, 0);
1653
1654 __ li(tmp1, 16);
1655
1656 // Backbranch target aligned to 32-byte. Not 16-byte align as
1657 // loop contains < 8 instructions that fit inside a single
1658 // i-cache sector.
1659 __ align(32);
1660
1661 __ bind(l_7);
1662 // Use loop with VSX load/store instructions to
1663 // copy 8 elements a time.
1664 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1665 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1666 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1667 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1668 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1669 __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
1670 __ bdnz(l_7); // Dec CTR and loop if not zero.
1671
1672 } // VSX
1673 } // FasterArrayCopy
1674
1675 // copy 1 element at a time
1676 __ bind(l_2);
1677 __ cmpwi(CCR0, R5_ARG3, 0);
1678 __ beq(CCR0, l_1);
1679
1680 { // FasterArrayCopy
1681 __ mtctr(R5_ARG3);
1682 __ addi(R3_ARG1, R3_ARG1, -4);
1683 __ addi(R4_ARG2, R4_ARG2, -4);
1684
1685 __ bind(l_3);
1686 __ lwzu(tmp2, 4, R3_ARG1);
1687 __ stwu(tmp2, 4, R4_ARG2);
1688 __ bdnz(l_3);
1689 }
1690
1691 __ bind(l_1);
1692 return;
1693 }
1694
1695 // Generate stub for disjoint int copy. If "aligned" is true, the
1696 // "from" and "to" addresses are assumed to be heapword aligned.
1697 //
1698 // Arguments for generated stub:
1699 // from: R3_ARG1
1700 // to: R4_ARG2
1701 // count: R5_ARG3 treated as signed
1702 //
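  // The address returned here is recorded by generate_arraycopy_stubs()
  // below (e.g. as StubRoutines::_jint_disjoint_arraycopy) and later invoked
  // by runtime and compiled code. Conceptually it behaves like a C function
  // with this (illustrative) signature:
  //
  //   void jint_disjoint_arraycopy(jint* from, jint* to, ssize_t count);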
1703  address generate_disjoint_int_copy(bool aligned, const char * name) {
1704 StubCodeMark mark(this, "StubRoutines", name);
1705 address start = __ function_entry();
1706 generate_disjoint_int_copy_core(aligned);
1707 __ blr();
1708 return start;
1709 }
1710
1711 // Generate core code for conjoint int copy (and oop copy on
1712 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1713 // are assumed to be heapword aligned.
1714 //
1715 // Arguments:
1716 // from: R3_ARG1
1717 // to: R4_ARG2
1718 // count: R5_ARG3 treated as signed
1719 //
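  // The code below copies backwards (highest element first), which makes an
  // overlapping copy with to > from safe without further checks. Roughly
  // (illustration only):
  //
  //   for (ssize_t i = count - 1; i >= 0; i--) {
  //     to[i] = from[i];
  //   }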
1720  void generate_conjoint_int_copy_core(bool aligned) {
1721 // Do reverse copy. We assume the case of actual overlap is rare enough
1722 // that we don't have to optimize it.
1723
1724 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1725
1726 Register tmp1 = R6_ARG4;
1727 Register tmp2 = R7_ARG5;
1728 Register tmp3 = R8_ARG6;
1729 Register tmp4 = R0;
1730
1731 VectorSRegister tmp_vsr1 = VSR1;
1732 VectorSRegister tmp_vsr2 = VSR2;
1733
1734 { // FasterArrayCopy
1735 __ cmpwi(CCR0, R5_ARG3, 0);
1736 __ beq(CCR0, l_6);
1737
1738 __ sldi(R5_ARG3, R5_ARG3, 2);
1739 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1740 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1741 __ srdi(R5_ARG3, R5_ARG3, 2);
1742
1743 if (!aligned) {
1744 // check if arrays have same alignment mod 8.
1745 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1746 __ andi_(R0, tmp1, 7);
1747 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1748 __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1749
1750 // copy 1 element to align to and from on an 8 byte boundary
1751 __ andi_(R0, R3_ARG1, 7);
1752 __ beq(CCR0, l_7);
1753
1754 __ addi(R3_ARG1, R3_ARG1, -4);
1755 __ addi(R4_ARG2, R4_ARG2, -4);
1756 __ addi(R5_ARG3, R5_ARG3, -1);
1757 __ lwzx(tmp2, R3_ARG1);
1758 __ stwx(tmp2, R4_ARG2);
1759 __ bind(l_7);
1760 }
1761
1762 __ cmpwi(CCR0, R5_ARG3, 7);
1763 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1764
1765 __ srdi(tmp1, R5_ARG3, 3);
1766 __ andi(R5_ARG3, R5_ARG3, 7);
1767 __ mtctr(tmp1);
1768
1769 if (!VM_Version::has_vsx()) {
1770 __ bind(l_4);
1771           // Use unrolled version for mass copying (copy 8 elements a time).
1772           // A load feeding a store has zero latency on Power6, but not on Power5.
1773           // Therefore, the following sequence works well on both.
1774 __ addi(R3_ARG1, R3_ARG1, -32);
1775 __ addi(R4_ARG2, R4_ARG2, -32);
1776 __ ld(tmp4, 24, R3_ARG1);
1777 __ ld(tmp3, 16, R3_ARG1);
1778 __ ld(tmp2, 8, R3_ARG1);
1779 __ ld(tmp1, 0, R3_ARG1);
1780 __ std(tmp4, 24, R4_ARG2);
1781 __ std(tmp3, 16, R4_ARG2);
1782 __ std(tmp2, 8, R4_ARG2);
1783 __ std(tmp1, 0, R4_ARG2);
1784 __ bdnz(l_4);
1785 } else { // Processor supports VSX, so use it to mass copy.
1786 // Prefetch the data into the L2 cache.
1787 __ dcbt(R3_ARG1, 0);
1788
1789 __ li(tmp1, 16);
1790
1791           // Align the backbranch target to 32 bytes rather than 16: the loop
1792           // contains fewer than 8 instructions, so it fits within a single
1793           // 32-byte i-cache sector.
1794 __ align(32);
1795
1796 __ bind(l_4);
1797 // Use loop with VSX load/store instructions to
1798 // copy 8 elements a time.
1799 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1800           __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1801 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1802 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1803 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1804 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1805 __ bdnz(l_4);
1806
1807 }
1808
1809 __ cmpwi(CCR0, R5_ARG3, 0);
1810 __ beq(CCR0, l_6);
1811
1812 __ bind(l_5);
1813 __ mtctr(R5_ARG3);
1814 __ bind(l_3);
1815 __ lwz(R0, -4, R3_ARG1);
1816 __ stw(R0, -4, R4_ARG2);
1817 __ addi(R3_ARG1, R3_ARG1, -4);
1818 __ addi(R4_ARG2, R4_ARG2, -4);
1819 __ bdnz(l_3);
1820
1821 __ bind(l_6);
1822 }
1823 }
1824
1825 // Generate stub for conjoint int copy. If "aligned" is true, the
1826 // "from" and "to" addresses are assumed to be heapword aligned.
1827 //
1828 // Arguments for generated stub:
1829 // from: R3_ARG1
1830 // to: R4_ARG2
1831 // count: R5_ARG3 treated as signed
1832 //
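  // Before falling back to the (reverse) conjoint copy, array_overlap_test()
  // below branches to the matching disjoint stub when the ranges cannot
  // overlap. On ELFv2 the StubRoutines entry is the code address itself; on
  // the older ABI it is a function descriptor, so its ->entry() field is
  // used as the branch target.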
1833  address generate_conjoint_int_copy(bool aligned, const char * name) {
1834 StubCodeMark mark(this, "StubRoutines", name);
1835 address start = __ function_entry();
1836
1837 #if defined(ABI_ELFv2)
1838 address nooverlap_target = aligned ?
1839 StubRoutines::arrayof_jint_disjoint_arraycopy() :
1840 StubRoutines::jint_disjoint_arraycopy();
1841 #else
1842 address nooverlap_target = aligned ?
1843 ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
1844 ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
1845 #endif
1846
1847 array_overlap_test(nooverlap_target, 2);
1848
1849 generate_conjoint_int_copy_core(aligned);
1850
1851 __ blr();
1852
1853 return start;
1854 }
1855
1856 // Generate core code for disjoint long copy (and oop copy on
1857 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1858 // are assumed to be heapword aligned.
1859 //
1860 // Arguments:
1861 // from: R3_ARG1
1862 // to: R4_ARG2
1863 // count: R5_ARG3 treated as signed
1864 //
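  // As with the int variant, the generated code is roughly equivalent to the
  // following scalar loop (illustration only; the stub moves 4 longs / 32
  // bytes per unrolled or VSX iteration and handles the remainder one element
  // at a time):
  //
  //   for (ssize_t i = 0; i < count; i++) {
  //     to[i] = from[i];   // 64-bit elements
  //   }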
1865  void generate_disjoint_long_copy_core(bool aligned) {
1866 Register tmp1 = R6_ARG4;
1867 Register tmp2 = R7_ARG5;
1868 Register tmp3 = R8_ARG6;
1869 Register tmp4 = R0;
1870
1871 Label l_1, l_2, l_3, l_4, l_5;
1872
1873 VectorSRegister tmp_vsr1 = VSR1;
1874 VectorSRegister tmp_vsr2 = VSR2;
1875
1876 { // FasterArrayCopy
1877 __ cmpwi(CCR0, R5_ARG3, 3);
1878 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1879
1880 __ srdi(tmp1, R5_ARG3, 2);
1881 __ andi_(R5_ARG3, R5_ARG3, 3);
1882 __ mtctr(tmp1);
1883
1884 if (!VM_Version::has_vsx()) {
1885 __ bind(l_4);
1886 // Use unrolled version for mass copying (copy 4 elements a time).
1887         // A load feeding a store has zero latency on Power6, but not on Power5.
1888         // Therefore, the following sequence works well on both.
1889 __ ld(tmp1, 0, R3_ARG1);
1890 __ ld(tmp2, 8, R3_ARG1);
1891 __ ld(tmp3, 16, R3_ARG1);
1892 __ ld(tmp4, 24, R3_ARG1);
1893 __ std(tmp1, 0, R4_ARG2);
1894 __ std(tmp2, 8, R4_ARG2);
1895 __ std(tmp3, 16, R4_ARG2);
1896 __ std(tmp4, 24, R4_ARG2);
1897 __ addi(R3_ARG1, R3_ARG1, 32);
1898 __ addi(R4_ARG2, R4_ARG2, 32);
1899 __ bdnz(l_4);
1900
1901 } else { // Processor supports VSX, so use it to mass copy.
1902
1903 // Prefetch the data into the L2 cache.
1904 __ dcbt(R3_ARG1, 0);
1905
1906 __ li(tmp1, 16);
1907
1908         // Align the backbranch target to 32 bytes rather than 16: the loop
1909         // contains fewer than 8 instructions, so it fits within a single
1910         // 32-byte i-cache sector.
1911 __ align(32);
1912
1913 __ bind(l_5);
1914 // Use loop with VSX load/store instructions to
1915 // copy 4 elements a time.
1916 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1917 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1918 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1919 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1920 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1921         __ addi(R4_ARG2, R4_ARG2, 32);        // Update dst+=32
1922 __ bdnz(l_5); // Dec CTR and loop if not zero.
1923
1924 } // VSX
1925 } // FasterArrayCopy
1926
1927 // copy 1 element at a time
1928 __ bind(l_3);
1929 __ cmpwi(CCR0, R5_ARG3, 0);
1930 __ beq(CCR0, l_1);
1931
1932 { // FasterArrayCopy
1933 __ mtctr(R5_ARG3);
1934 __ addi(R3_ARG1, R3_ARG1, -8);
1935 __ addi(R4_ARG2, R4_ARG2, -8);
1936
1937 __ bind(l_2);
1938 __ ldu(R0, 8, R3_ARG1);
1939 __ stdu(R0, 8, R4_ARG2);
1940 __ bdnz(l_2);
1941
1942 }
1943 __ bind(l_1);
1944 }
1945
1946 // Generate stub for disjoint long copy. If "aligned" is true, the
1947 // "from" and "to" addresses are assumed to be heapword aligned.
1948 //
1949 // Arguments for generated stub:
1950 // from: R3_ARG1
1951 // to: R4_ARG2
1952 // count: R5_ARG3 treated as signed
1953 //
1954  address generate_disjoint_long_copy(bool aligned, const char * name) {
1955 StubCodeMark mark(this, "StubRoutines", name);
1956 address start = __ function_entry();
1957 generate_disjoint_long_copy_core(aligned);
1958 __ blr();
1959
1960 return start;
1961 }
1962
1963 // Generate core code for conjoint long copy (and oop copy on
1964 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1965 // are assumed to be heapword aligned.
1966 //
1967 // Arguments:
1968 // from: R3_ARG1
1969 // to: R4_ARG2
1970 // count: R5_ARG3 treated as signed
1971 //
1972  void generate_conjoint_long_copy_core(bool aligned) {
1973 Register tmp1 = R6_ARG4;
1974 Register tmp2 = R7_ARG5;
1975 Register tmp3 = R8_ARG6;
1976 Register tmp4 = R0;
1977
1978 VectorSRegister tmp_vsr1 = VSR1;
1979 VectorSRegister tmp_vsr2 = VSR2;
1980
1981 Label l_1, l_2, l_3, l_4, l_5;
1982
1983 __ cmpwi(CCR0, R5_ARG3, 0);
1984 __ beq(CCR0, l_1);
1985
1986 { // FasterArrayCopy
1987 __ sldi(R5_ARG3, R5_ARG3, 3);
1988 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1989 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1990 __ srdi(R5_ARG3, R5_ARG3, 3);
1991
1992 __ cmpwi(CCR0, R5_ARG3, 3);
1993 __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1994
1995 __ srdi(tmp1, R5_ARG3, 2);
1996 __ andi(R5_ARG3, R5_ARG3, 3);
1997 __ mtctr(tmp1);
1998
1999 if (!VM_Version::has_vsx()) {
2000 __ bind(l_4);
2001 // Use unrolled version for mass copying (copy 4 elements a time).
2002           // A load feeding a store has zero latency on Power6, but not on Power5.
2003           // Therefore, the following sequence works well on both.
2004 __ addi(R3_ARG1, R3_ARG1, -32);
2005 __ addi(R4_ARG2, R4_ARG2, -32);
2006 __ ld(tmp4, 24, R3_ARG1);
2007 __ ld(tmp3, 16, R3_ARG1);
2008 __ ld(tmp2, 8, R3_ARG1);
2009 __ ld(tmp1, 0, R3_ARG1);
2010 __ std(tmp4, 24, R4_ARG2);
2011 __ std(tmp3, 16, R4_ARG2);
2012 __ std(tmp2, 8, R4_ARG2);
2013 __ std(tmp1, 0, R4_ARG2);
2014 __ bdnz(l_4);
2015 } else { // Processor supports VSX, so use it to mass copy.
2016 // Prefetch the data into the L2 cache.
2017 __ dcbt(R3_ARG1, 0);
2018
2019 __ li(tmp1, 16);
2020
2021           // Align the backbranch target to 32 bytes rather than 16: the loop
2022           // contains fewer than 8 instructions, so it fits within a single
2023           // 32-byte i-cache sector.
2024 __ align(32);
2025
2026 __ bind(l_4);
2027 // Use loop with VSX load/store instructions to
2028 // copy 4 elements a time.
2029 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
2030           __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
2031 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
2032 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
2033 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
2034 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
2035 __ bdnz(l_4);
2036
2037 }
2038
2039 __ cmpwi(CCR0, R5_ARG3, 0);
2040 __ beq(CCR0, l_1);
2041
2042 __ bind(l_5);
2043 __ mtctr(R5_ARG3);
2044 __ bind(l_3);
2045 __ ld(R0, -8, R3_ARG1);
2046 __ std(R0, -8, R4_ARG2);
2047 __ addi(R3_ARG1, R3_ARG1, -8);
2048 __ addi(R4_ARG2, R4_ARG2, -8);
2049 __ bdnz(l_3);
2050
2051 }
2052 __ bind(l_1);
2053 }
2054
2055 // Generate stub for conjoint long copy. If "aligned" is true, the
2056 // "from" and "to" addresses are assumed to be heapword aligned.
2057 //
2058 // Arguments for generated stub:
2059 // from: R3_ARG1
2060 // to: R4_ARG2
2061 // count: R5_ARG3 treated as signed
2062 //
2063  address generate_conjoint_long_copy(bool aligned, const char * name) {
2064 StubCodeMark mark(this, "StubRoutines", name);
2065 address start = __ function_entry();
2066
2067 #if defined(ABI_ELFv2)
2068 address nooverlap_target = aligned ?
2069 StubRoutines::arrayof_jlong_disjoint_arraycopy() :
2070 StubRoutines::jlong_disjoint_arraycopy();
2071 #else
2072 address nooverlap_target = aligned ?
2073 ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
2074 ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
2075 #endif
2076
2077 array_overlap_test(nooverlap_target, 3);
2078 generate_conjoint_long_copy_core(aligned);
2079
2080 __ blr();
2081
2082 return start;
2083 }
2084
2085 // Generate stub for conjoint oop copy. If "aligned" is true, the
2086 // "from" and "to" addresses are assumed to be heapword aligned.
2087 //
2088 // Arguments for generated stub:
2089 // from: R3_ARG1
2090 // to: R4_ARG2
2091 // count: R5_ARG3 treated as signed
2092 // dest_uninitialized: G1 support
2093 //
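  // Note: with UseCompressedOops each element is a 32-bit narrow oop, so the
  // int copy core is reused below; otherwise the long copy core is used. The
  // GC pre/post write barriers bracket the copy, and "dest_uninitialized"
  // tells the pre-barrier that the destination holds no previous values
  // (e.g. a freshly allocated array).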
2094  address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2095 StubCodeMark mark(this, "StubRoutines", name);
2096
2097 address start = __ function_entry();
2098
2099 #if defined(ABI_ELFv2)
2100 address nooverlap_target = aligned ?
2101 StubRoutines::arrayof_oop_disjoint_arraycopy() :
2102 StubRoutines::oop_disjoint_arraycopy();
2103 #else
2104 address nooverlap_target = aligned ?
2105 ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
2106 ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
2107 #endif
2108
2109 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2110
2111 // Save arguments.
2112 __ mr(R9_ARG7, R4_ARG2);
2113 __ mr(R10_ARG8, R5_ARG3);
2114
2115 if (UseCompressedOops) {
2116 array_overlap_test(nooverlap_target, 2);
2117 generate_conjoint_int_copy_core(aligned);
2118 } else {
2119 array_overlap_test(nooverlap_target, 3);
2120 generate_conjoint_long_copy_core(aligned);
2121 }
2122
2123 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
2124 return start;
2125 }
2126
2127 // Generate stub for disjoint oop copy. If "aligned" is true, the
2128 // "from" and "to" addresses are assumed to be heapword aligned.
2129 //
2130 // Arguments for generated stub:
2131 // from: R3_ARG1
2132 // to: R4_ARG2
2133 // count: R5_ARG3 treated as signed
2134 // dest_uninitialized: G1 support
2135 //
2136  address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2137 StubCodeMark mark(this, "StubRoutines", name);
2138 address start = __ function_entry();
2139
2140 gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2141
2142     // Save the destination and count; the copy cores clobber them and they
2143     // are still needed for the post barrier.
2144 __ mr(R9_ARG7, R4_ARG2);
2145 __ mr(R10_ARG8, R5_ARG3);
2146
2147 if (UseCompressedOops) {
2148 generate_disjoint_int_copy_core(aligned);
2149 } else {
2150 generate_disjoint_long_copy_core(aligned);
2151 }
2152
2153 gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
2154
2155 return start;
2156 }
2157
2158 // Arguments for generated stub:
2159 // R3_ARG1 - source byte array address
2160 // R4_ARG2 - destination byte array address
2161 // R5_ARG3 - round key array
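  // Key layout (background, not specific to this stub): the expanded key is
  // an int array whose length -- loaded as "keylen" below -- is 44, 52 or 60
  // ints for AES-128/192/256, i.e. 11, 13 or 15 16-byte round keys stored
  // back to back. They are fetched with lvx at offsets 0, 16, ..., 240 and
  // applied with vcipher for the middle rounds and vcipherlast for the final
  // round.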
2162  address generate_aescrypt_encryptBlock() {
2163     assert(UseAES, "need AES instructions");
2164 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2165
2166 address start = __ function_entry();
2167
2168 Label L_doLast;
2169
2170 Register from = R3_ARG1; // source array address
2171 Register to = R4_ARG2; // destination array address
2172 Register key = R5_ARG3; // round key array
2173
2174 Register keylen = R8;
2175 Register temp = R9;
2176 Register keypos = R10;
2177 Register fifteen = R12;
2178
2179 VectorRegister vRet = VR0;
2180
2181 VectorRegister vKey1 = VR1;
2182 VectorRegister vKey2 = VR2;
2183 VectorRegister vKey3 = VR3;
2184 VectorRegister vKey4 = VR4;
2185
2186 VectorRegister fromPerm = VR5;
2187 VectorRegister keyPerm = VR6;
2188 VectorRegister toPerm = VR7;
2189 VectorRegister fSplt = VR8;
2190
2191 VectorRegister vTmp1 = VR9;
2192 VectorRegister vTmp2 = VR10;
2193 VectorRegister vTmp3 = VR11;
2194 VectorRegister vTmp4 = VR12;
2195
2196 __ li (fifteen, 15);
2197
2198     // load unaligned from[0-15] to vRet
2199 __ lvx (vRet, from);
2200 __ lvx (vTmp1, fifteen, from);
2201 __ lvsl (fromPerm, from);
2202 #ifdef VM_LITTLE_ENDIAN
2203 __ vspltisb (fSplt, 0x0f);
2204 __ vxor (fromPerm, fromPerm, fSplt);
2205 #endif
2206 __ vperm (vRet, vRet, vTmp1, fromPerm);
2207
2208 // load keylen (44 or 52 or 60)
2209 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2210
2211     // Prepare the permutation vector used to load the round keys.
2212 __ load_perm (keyPerm, key);
2213 #ifdef VM_LITTLE_ENDIAN
2214 __ vspltisb (vTmp2, -16);
2215 __ vrld (keyPerm, keyPerm, vTmp2);
2216 __ vrld (keyPerm, keyPerm, vTmp2);
2217 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2218 #endif
2219
2220 // load the 1st round key to vTmp1
2221 __ lvx (vTmp1, key);
2222 __ li (keypos, 16);
2223 __ lvx (vKey1, keypos, key);
2224 __ vec_perm (vTmp1, vKey1, keyPerm);
2225
2226 // 1st round
2227 __ vxor (vRet, vRet, vTmp1);
2228
2229 // load the 2nd round key to vKey1
2230 __ li (keypos, 32);
2231 __ lvx (vKey2, keypos, key);
2232 __ vec_perm (vKey1, vKey2, keyPerm);
2233
2234 // load the 3rd round key to vKey2
2235 __ li (keypos, 48);
2236 __ lvx (vKey3, keypos, key);
2237 __ vec_perm (vKey2, vKey3, keyPerm);
2238
2239 // load the 4th round key to vKey3
2240 __ li (keypos, 64);
2241 __ lvx (vKey4, keypos, key);
2242 __ vec_perm (vKey3, vKey4, keyPerm);
2243
2244 // load the 5th round key to vKey4
2245 __ li (keypos, 80);
2246 __ lvx (vTmp1, keypos, key);
2247 __ vec_perm (vKey4, vTmp1, keyPerm);
2248
2249 // 2nd - 5th rounds
2250 __ vcipher (vRet, vRet, vKey1);
2251 __ vcipher (vRet, vRet, vKey2);
2252 __ vcipher (vRet, vRet, vKey3);
2253 __ vcipher (vRet, vRet, vKey4);
2254
2255 // load the 6th round key to vKey1
2256 __ li (keypos, 96);
2257 __ lvx (vKey2, keypos, key);
2258 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2259
2260 // load the 7th round key to vKey2
2261 __ li (keypos, 112);
2262 __ lvx (vKey3, keypos, key);
2263 __ vec_perm (vKey2, vKey3, keyPerm);
2264
2265 // load the 8th round key to vKey3
2266 __ li (keypos, 128);
2267 __ lvx (vKey4, keypos, key);
2268 __ vec_perm (vKey3, vKey4, keyPerm);
2269
2270 // load the 9th round key to vKey4
2271 __ li (keypos, 144);
2272 __ lvx (vTmp1, keypos, key);
2273 __ vec_perm (vKey4, vTmp1, keyPerm);
2274
2275 // 6th - 9th rounds
2276 __ vcipher (vRet, vRet, vKey1);
2277 __ vcipher (vRet, vRet, vKey2);
2278 __ vcipher (vRet, vRet, vKey3);
2279 __ vcipher (vRet, vRet, vKey4);
2280
2281 // load the 10th round key to vKey1
2282 __ li (keypos, 160);
2283 __ lvx (vKey2, keypos, key);
2284 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2285
2286 // load the 11th round key to vKey2
2287 __ li (keypos, 176);
2288 __ lvx (vTmp1, keypos, key);
2289 __ vec_perm (vKey2, vTmp1, keyPerm);
2290
2291 // if all round keys are loaded, skip next 4 rounds
2292 __ cmpwi (CCR0, keylen, 44);
2293 __ beq (CCR0, L_doLast);
2294
2295 // 10th - 11th rounds
2296 __ vcipher (vRet, vRet, vKey1);
2297 __ vcipher (vRet, vRet, vKey2);
2298
2299 // load the 12th round key to vKey1
2300 __ li (keypos, 192);
2301 __ lvx (vKey2, keypos, key);
2302 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2303
2304 // load the 13th round key to vKey2
2305 __ li (keypos, 208);
2306 __ lvx (vTmp1, keypos, key);
2307 __ vec_perm (vKey2, vTmp1, keyPerm);
2308
2309 // if all round keys are loaded, skip next 2 rounds
2310 __ cmpwi (CCR0, keylen, 52);
2311 __ beq (CCR0, L_doLast);
2312
2313 // 12th - 13th rounds
2314 __ vcipher (vRet, vRet, vKey1);
2315 __ vcipher (vRet, vRet, vKey2);
2316
2317 // load the 14th round key to vKey1
2318 __ li (keypos, 224);
2319 __ lvx (vKey2, keypos, key);
2320 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2321
2322 // load the 15th round key to vKey2
2323 __ li (keypos, 240);
2324 __ lvx (vTmp1, keypos, key);
2325 __ vec_perm (vKey2, vTmp1, keyPerm);
2326
2327 __ bind(L_doLast);
2328
2329 // last two rounds
2330 __ vcipher (vRet, vRet, vKey1);
2331 __ vcipherlast (vRet, vRet, vKey2);
2332
2333 // store result (unaligned)
2334 #ifdef VM_LITTLE_ENDIAN
2335 __ lvsl (toPerm, to);
2336 #else
2337 __ lvsr (toPerm, to);
2338 #endif
2339 __ vspltisb (vTmp3, -1);
2340 __ vspltisb (vTmp4, 0);
2341 __ lvx (vTmp1, to);
2342 __ lvx (vTmp2, fifteen, to);
2343 #ifdef VM_LITTLE_ENDIAN
2344 __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
2345 __ vxor (toPerm, toPerm, fSplt); // swap bytes
2346 #else
2347 __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
2348 #endif
2349 __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
2350 __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
2351 __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
2352 __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
2353 __ stvx (vTmp1, to);
2354
2355 __ blr();
2356 return start;
2357 }
2358
2359 // Arguments for generated stub:
2360 // R3_ARG1 - source byte array address
2361 // R4_ARG2 - destination byte array address
2362 // R5_ARG3 - K (key) in little endian int array
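  // Decryption applies the same expanded key in reverse: the code first
  // branches on keylen (44/52, with the 60-int case as the fall-through),
  // xors in the highest round key, undoes the middle rounds with vncipher,
  // and finishes with vncipherlast using the first round key.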
2363  address generate_aescrypt_decryptBlock() {
2364     assert(UseAES, "need AES instructions");
2365 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2366
2367 address start = __ function_entry();
2368
2369 Label L_doLast;
2370 Label L_do44;
2371 Label L_do52;
2372 Label L_do60;
2373
2374 Register from = R3_ARG1; // source array address
2375 Register to = R4_ARG2; // destination array address
2376 Register key = R5_ARG3; // round key array
2377
2378 Register keylen = R8;
2379 Register temp = R9;
2380 Register keypos = R10;
2381 Register fifteen = R12;
2382
2383 VectorRegister vRet = VR0;
2384
2385 VectorRegister vKey1 = VR1;
2386 VectorRegister vKey2 = VR2;
2387 VectorRegister vKey3 = VR3;
2388 VectorRegister vKey4 = VR4;
2389 VectorRegister vKey5 = VR5;
2390
2391 VectorRegister fromPerm = VR6;
2392 VectorRegister keyPerm = VR7;
2393 VectorRegister toPerm = VR8;
2394 VectorRegister fSplt = VR9;
2395
2396 VectorRegister vTmp1 = VR10;
2397 VectorRegister vTmp2 = VR11;
2398 VectorRegister vTmp3 = VR12;
2399 VectorRegister vTmp4 = VR13;
2400
2401 __ li (fifteen, 15);
2402
2403     // load unaligned from[0-15] to vRet
2404 __ lvx (vRet, from);
2405 __ lvx (vTmp1, fifteen, from);
2406 __ lvsl (fromPerm, from);
2407 #ifdef VM_LITTLE_ENDIAN
2408 __ vspltisb (fSplt, 0x0f);
2409 __ vxor (fromPerm, fromPerm, fSplt);
2410 #endif
2411 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2412
2413 // load keylen (44 or 52 or 60)
2414 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2415
2416     // Prepare the permutation vector used to load the round keys.
2417 __ load_perm (keyPerm, key);
2418 #ifdef VM_LITTLE_ENDIAN
2419 __ vxor (vTmp2, vTmp2, vTmp2);
2420 __ vspltisb (vTmp2, -16);
2421 __ vrld (keyPerm, keyPerm, vTmp2);
2422 __ vrld (keyPerm, keyPerm, vTmp2);
2423 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2424 #endif
2425
2426 __ cmpwi (CCR0, keylen, 44);
2427 __ beq (CCR0, L_do44);
2428
2429 __ cmpwi (CCR0, keylen, 52);
2430 __ beq (CCR0, L_do52);
2431
2432 // load the 15th round key to vKey1
2433 __ li (keypos, 240);
2434 __ lvx (vKey1, keypos, key);
2435 __ li (keypos, 224);
2436 __ lvx (vKey2, keypos, key);
2437 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
2438
2439 // load the 14th round key to vKey2
2440 __ li (keypos, 208);
2441 __ lvx (vKey3, keypos, key);
2442 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
2443
2444 // load the 13th round key to vKey3
2445 __ li (keypos, 192);
2446 __ lvx (vKey4, keypos, key);
2447 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
2448
2449 // load the 12th round key to vKey4
2450 __ li (keypos, 176);
2451 __ lvx (vKey5, keypos, key);
2452 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
2453
2454 // load the 11th round key to vKey5
2455 __ li (keypos, 160);
2456 __ lvx (vTmp1, keypos, key);
2457 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
2458
2459 // 1st - 5th rounds
2460 __ vxor (vRet, vRet, vKey1);
2461 __ vncipher (vRet, vRet, vKey2);
2462 __ vncipher (vRet, vRet, vKey3);
2463 __ vncipher (vRet, vRet, vKey4);
2464 __ vncipher (vRet, vRet, vKey5);
2465
2466 __ b (L_doLast);
2467
2468 __ bind (L_do52);
2469
2470 // load the 13th round key to vKey1
2471 __ li (keypos, 208);
2472 __ lvx (vKey1, keypos, key);
2473 __ li (keypos, 192);
2474 __ lvx (vKey2, keypos, key);
2475 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
2476
2477 // load the 12th round key to vKey2
2478 __ li (keypos, 176);
2479 __ lvx (vKey3, keypos, key);
2480 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
2481
2482 // load the 11th round key to vKey3
2483 __ li (keypos, 160);
2484 __ lvx (vTmp1, keypos, key);
2485 __ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
2486
2487 // 1st - 3rd rounds
2488 __ vxor (vRet, vRet, vKey1);
2489 __ vncipher (vRet, vRet, vKey2);
2490 __ vncipher (vRet, vRet, vKey3);
2491
2492 __ b (L_doLast);
2493
2494 __ bind (L_do44);
2495
2496 // load the 11th round key to vKey1
2497 __ li (keypos, 176);
2498 __ lvx (vKey1, keypos, key);
2499 __ li (keypos, 160);
2500 __ lvx (vTmp1, keypos, key);
2501 __ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
2502
2503 // 1st round
2504 __ vxor (vRet, vRet, vKey1);
2505
2506 __ bind (L_doLast);
2507
2508 // load the 10th round key to vKey1
2509 __ li (keypos, 144);
2510 __ lvx (vKey2, keypos, key);
2511 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
2512
2513 // load the 9th round key to vKey2
2514 __ li (keypos, 128);
2515 __ lvx (vKey3, keypos, key);
2516 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
2517
2518 // load the 8th round key to vKey3
2519 __ li (keypos, 112);
2520 __ lvx (vKey4, keypos, key);
2521 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
2522
2523 // load the 7th round key to vKey4
2524 __ li (keypos, 96);
2525 __ lvx (vKey5, keypos, key);
2526 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
2527
2528 // load the 6th round key to vKey5
2529 __ li (keypos, 80);
2530 __ lvx (vTmp1, keypos, key);
2531 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
2532
2533 // last 10th - 6th rounds
2534 __ vncipher (vRet, vRet, vKey1);
2535 __ vncipher (vRet, vRet, vKey2);
2536 __ vncipher (vRet, vRet, vKey3);
2537 __ vncipher (vRet, vRet, vKey4);
2538 __ vncipher (vRet, vRet, vKey5);
2539
2540 // load the 5th round key to vKey1
2541 __ li (keypos, 64);
2542 __ lvx (vKey2, keypos, key);
2543 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
2544
2545 // load the 4th round key to vKey2
2546 __ li (keypos, 48);
2547 __ lvx (vKey3, keypos, key);
2548 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
2549
2550 // load the 3rd round key to vKey3
2551 __ li (keypos, 32);
2552 __ lvx (vKey4, keypos, key);
2553 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
2554
2555 // load the 2nd round key to vKey4
2556 __ li (keypos, 16);
2557 __ lvx (vKey5, keypos, key);
2558 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
2559
2560 // load the 1st round key to vKey5
2561 __ lvx (vTmp1, key);
2562 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
2563
2564     // last 5th - 1st rounds
2565 __ vncipher (vRet, vRet, vKey1);
2566 __ vncipher (vRet, vRet, vKey2);
2567 __ vncipher (vRet, vRet, vKey3);
2568 __ vncipher (vRet, vRet, vKey4);
2569 __ vncipherlast (vRet, vRet, vKey5);
2570
2571 // store result (unaligned)
2572 #ifdef VM_LITTLE_ENDIAN
2573 __ lvsl (toPerm, to);
2574 #else
2575 __ lvsr (toPerm, to);
2576 #endif
2577 __ vspltisb (vTmp3, -1);
2578 __ vspltisb (vTmp4, 0);
2579 __ lvx (vTmp1, to);
2580 __ lvx (vTmp2, fifteen, to);
2581 #ifdef VM_LITTLE_ENDIAN
2582 __ vperm (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
2583 __ vxor (toPerm, toPerm, fSplt); // swap bytes
2584 #else
2585 __ vperm (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
2586 #endif
2587 __ vperm (vTmp4, vRet, vRet, toPerm); // rotate data
2588 __ vsel (vTmp2, vTmp4, vTmp2, vTmp3);
2589 __ vsel (vTmp1, vTmp1, vTmp4, vTmp3);
2590 __ stvx (vTmp2, fifteen, to); // store this one first (may alias)
2591 __ stvx (vTmp1, to);
2592
2593 __ blr();
2594 return start;
2595 }
2596
2597  address generate_sha256_implCompress(bool multi_block, const char *name) {
2598 assert(UseSHA, "need SHA instructions");
2599 StubCodeMark mark(this, "StubRoutines", name);
2600 address start = __ function_entry();
2601
2602 __ sha256 (multi_block);
2603
2604 __ blr();
2605 return start;
2606 }
2607
2608  address generate_sha512_implCompress(bool multi_block, const char *name) {
2609 assert(UseSHA, "need SHA instructions");
2610 StubCodeMark mark(this, "StubRoutines", name);
2611 address start = __ function_entry();
2612
2613 __ sha512 (multi_block);
2614
2615 __ blr();
2616 return start;
2617 }
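  // Both SHA stubs delegate to the MacroAssembler sha256()/sha512() routines,
  // which emit the compression function (typically via the Power ISA 2.07
  // vector SHA instructions); "multi_block" selects the variant that loops
  // over several input blocks per call, i.e. the *MB entry points registered
  // in generate_all() below.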
2618
2619  void generate_arraycopy_stubs() {
2620 // Note: the disjoint stubs must be generated first, some of
2621 // the conjoint stubs use them.
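    // (The conjoint stubs branch to the corresponding disjoint entry via
    // array_overlap_test() whenever source and destination cannot overlap,
    // which is why the disjoint entry points must already exist.)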
2622
2623 // non-aligned disjoint versions
2624 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
2625 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
2626 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
2627 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
2628 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
2629 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
2630
2631 // aligned disjoint versions
2632 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
2633 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
2634 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
2635 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
2636 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
2637     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
2638
2639 // non-aligned conjoint versions
2640 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
2641 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
2642 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
2643 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
2644 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
2645 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
2646
2647 // aligned conjoint versions
2648 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2649 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2650 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
2651 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
2652 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
2653     StubRoutines::_arrayof_oop_arraycopy_uninit      = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
2654
2655 // fill routines
2656 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2657 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2658 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2659 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2660 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2661 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2662 }
2663
2664 // Safefetch stubs.
2665  void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
2666 // safefetch signatures:
2667 // int SafeFetch32(int* adr, int errValue);
2668 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2669 //
2670 // arguments:
2671 // R3_ARG1 = adr
2672 // R4_ARG2 = errValue
2673 //
2674 // result:
2675 // R3_RET = *adr or errValue
2676
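    // The stub is intentionally tiny: if the load below faults, the VM signal
    // handler recognizes *fault_pc and resumes at *continuation_pc, where
    // R4_ARG2 still holds errValue. Semantically (sketch only):
    //
    //   int SafeFetch32(int* adr, int errValue) {
    //     return is_readable(adr) ? *adr : errValue;  // is_readable() is illustrative
    //   }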
2677 StubCodeMark mark(this, "StubRoutines", name);
2678
2679 // Entry point, pc or function descriptor.
2680 *entry = __ function_entry();
2681
2682 // Load *adr into R4_ARG2, may fault.
2683 *fault_pc = __ pc();
2684 switch (size) {
2685 case 4:
2686       // int32_t, sign-extended
2687 __ lwa(R4_ARG2, 0, R3_ARG1);
2688 break;
2689 case 8:
2690 // int64_t
2691 __ ld(R4_ARG2, 0, R3_ARG1);
2692 break;
2693 default:
2694 ShouldNotReachHere();
2695 }
2696
2697 // return errValue or *adr
2698 *continuation_pc = __ pc();
2699 __ mr(R3_RET, R4_ARG2);
2700 __ blr();
2701 }
2702
2703 /**
2704 * Arguments:
2705 *
2706 * Inputs:
2707 * R3_ARG1 - int crc
2708 * R4_ARG2 - byte* buf
2709 * R5_ARG3 - int length (of buffer)
2710 *
2711 * scratch:
2712 * R2, R6-R12
2713 *
2714  * Output:
2715 * R3_RET - int crc result
2716 */
2717 // Compute CRC32 function.
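  // Two code paths are generated: on little-endian machines with vpmsumb
  // support, the vectorized kernel_crc32_1word_vpmsumd kernel is emitted
  // (carry-less multiply constants plus a Barrett reduction table);
  // otherwise the table-driven kernel_crc32_1word variant is used. Both take
  // (crc, buf, len) and leave the updated crc in R3_RET.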
2718  address generate_CRC32_updateBytes(const char* name) {
2719 __ align(CodeEntryAlignment);
2720 StubCodeMark mark(this, "StubRoutines", name);
2721 address start = __ function_entry(); // Remember stub start address (is rtn value).
2722
2723 // arguments to kernel_crc32:
2724 const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
2725 const Register data = R4_ARG2; // source byte array
2726 const Register dataLen = R5_ARG3; // #bytes to process
2727
2728 const Register table = R6; // crc table address
2729
2730 #ifdef VM_LITTLE_ENDIAN
2731 if (VM_Version::has_vpmsumb()) {
2732 const Register constants = R2; // constants address
2733 const Register bconstants = R8; // barret table address
2734
2735 const Register t0 = R9;
2736 const Register t1 = R10;
2737 const Register t2 = R11;
2738 const Register t3 = R12;
2739 const Register t4 = R7;
2740
2741 BLOCK_COMMENT("Stub body {");
2742 assert_different_registers(crc, data, dataLen, table);
2743
2744 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
2745 StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
2746 StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
2747
2748 __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
2749
2750 BLOCK_COMMENT("return");
2751 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
2752 __ blr();
2753
2754 BLOCK_COMMENT("} Stub body");
2755 } else
2756 #endif
2757 {
2758 const Register t0 = R2;
2759 const Register t1 = R7;
2760 const Register t2 = R8;
2761 const Register t3 = R9;
2762 const Register tc0 = R10;
2763 const Register tc1 = R11;
2764 const Register tc2 = R12;
2765
2766 BLOCK_COMMENT("Stub body {");
2767 assert_different_registers(crc, data, dataLen, table);
2768
2769 StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
2770
2771 __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
2772
2773 BLOCK_COMMENT("return");
2774 __ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
2775 __ blr();
2776
2777 BLOCK_COMMENT("} Stub body");
2778 }
2779
2780 return start;
2781 }
2782
2783 // Initialization
2784  void generate_initial() {
2785 // Generates all stubs and initializes the entry points
2786
2787 // Entry points that exist in all platforms.
2788 // Note: This is code that could be shared among different platforms - however the
2789 // benefit seems to be smaller than the disadvantage of having a
2790 // much more complicated generator structure. See also comment in
2791 // stubRoutines.hpp.
2792
2793 StubRoutines::_forward_exception_entry = generate_forward_exception();
2794 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
2795 StubRoutines::_catch_exception_entry = generate_catch_exception();
2796
2797 // Build this early so it's available for the interpreter.
2798 StubRoutines::_throw_StackOverflowError_entry =
2799 generate_throw_exception("StackOverflowError throw_exception",
2800 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2801
2802 // CRC32 Intrinsics.
2803 if (UseCRC32Intrinsics) {
2804 StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
2805 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
2806 }
2807 }
2808
2809  void generate_all() {
2810 // Generates all stubs and initializes the entry points
2811
2812 // These entry points require SharedInfo::stack0 to be set up in
2813 // non-core builds
2814 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
2815 // Handle IncompatibleClassChangeError in itable stubs.
2816 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
2817 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2818
2819 StubRoutines::_handler_for_unsafe_access_entry = generate_handler_for_unsafe_access();
2820
2821 // support for verify_oop (must happen after universe_init)
2822 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
2823
2824 // arraycopy stubs used by compilers
2825 generate_arraycopy_stubs();
2826
2827 // Safefetch stubs.
2828 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
2829 &StubRoutines::_safefetch32_fault_pc,
2830 &StubRoutines::_safefetch32_continuation_pc);
2831 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2832 &StubRoutines::_safefetchN_fault_pc,
2833 &StubRoutines::_safefetchN_continuation_pc);
2834
2835 if (UseAESIntrinsics) {
2836 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
2837 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
2838 }
2839
2840 if (UseMontgomeryMultiplyIntrinsic) {
2841 StubRoutines::_montgomeryMultiply
2842 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
2843 }
2844 if (UseMontgomerySquareIntrinsic) {
2845 StubRoutines::_montgomerySquare
2846 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
2847 }
2848
2849 if (UseSHA256Intrinsics) {
2850 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
2851 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
2852 }
2853 if (UseSHA512Intrinsics) {
2854 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
2855 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
2856 }
2857 }
2858
2859 public:
2860  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2861 // replace the standard masm with a special one:
2862 _masm = new MacroAssembler(code);
2863 if (all) {
2864 generate_all();
2865 } else {
2866 generate_initial();
2867 }
2868 }
2869 };
2870
2871 void StubGenerator_generate(CodeBuffer* code, bool all) {
2872 StubGenerator g(code, all);
2873 }
2874