/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

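// Load with a 31-bit offset. A single ld suffices if the offset fits in 16
// signed bits; otherwise an addis/ld pair is emitted. With emit_filler_nop,
// the short form is padded with a nop so both forms occupy two instructions.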
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8:  ld(dst, offs, base);                                     break;
  case 4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8:  std(dst, offs, base); break;
  case 4:  stw(dst, offs, base); break;
  case 2:  sth(dst, offs, base); break;
  case 1:  stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

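// Pad with nops until the code offset is congruent to rem modulo modulus.
// If more than max bytes of padding would be required, no padding is emitted.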
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate the given address from the global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// The clrldi, if present, is simply skipped over when patching.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
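// Two sequence shapes are recognized, discriminated by the second instruction:
// an ori variant with the immediate halfwords in instruction words 0, 1, 3 and 4
// (high to low), and a lis variant with them in words 0, 2, 1 and 3.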
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure: it neither flushes the instruction cache nor is it MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a patchable (not MT-safe) 64-bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);  offset += 8;
  stfd(F1,  offset, dst);  offset += 8;
  stfd(F2,  offset, dst);  offset += 8;
  stfd(F3,  offset, dst);  offset += 8;
  stfd(F4,  offset, dst);  offset += 8;
  stfd(F5,  offset, dst);  offset += 8;
  stfd(F6,  offset, dst);  offset += 8;
  stfd(F7,  offset, dst);  offset += 8;
  stfd(F8,  offset, dst);  offset += 8;
  stfd(F9,  offset, dst);  offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);  offset += 8;
  lfd(F1,  offset, src);  offset += 8;
  lfd(F2,  offset, src);  offset += 8;
  lfd(F3,  offset, src);  offset += 8;
  lfd(F4,  offset, src);  offset += 8;
  lfd(F5,  offset, src);  offset += 8;
  lfd(F6,  offset, src);  offset += 8;
  lfd(F7,  offset, src);  offset += 8;
  lfd(F8,  offset, src);  offset += 8;
  lfd(F9,  offset, src);  offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

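// Spill/reload CR and LR to/from their save slots relative to R1_SP.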
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

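// Get the current PC: branch-and-link to the immediately following instruction
// and read LR. Clobbers LR; returns the address that was loaded into result.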
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

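// Push a frame whose size in bytes is held in a register. The size must be
// frame-aligned and must not be passed in R0.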
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

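// Call into the VM runtime: set up the last Java frame, pass the current
// thread as first argument, do the call, and fetch the oop result if requested.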
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#if defined(LINUX) || defined(_ALLBSD_SOURCE)
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
#if defined(LINUX)
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
#elif defined(_ALLBSD_SOURCE)
  address addr = (address)uc->uc_mcontext.mc_gpr[ra] + (ssize_t)ds;
#endif
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#if defined(LINUX) || defined(_ALLBSD_SOURCE)
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
#if defined(LINUX)
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
#elif defined(_ALLBSD_SOURCE)
    return ds+(address)uc->uc_mcontext.mc_gpr[ra];
#endif
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
#if defined(LINUX)
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
#elif defined(_ALLBSD_SOURCE)
    address sp = (address)uc->uc_mcontext.mc_gpr[1];
    long rb_val = (long)uc->uc_mcontext.mc_gpr[rb];
#endif
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

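// 64-bit atomic get-and-set / get-and-add, implemented as ldarx/stdcx_ retry loops.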
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
1458 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1459 #ifdef VM_LITTLE_ENDIAN
1460 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1461 clrrdi(addr_base, addr_base, 2);
1462 #else
1463 xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1464 clrrdi(addr_base, addr_base, 2);
1465 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1466 #endif
1467 }
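  // Illustrative example (little-endian): for a byte at an addr_base ending in
  // binary ...10, the code above computes shift_amount == (2 & 3) * 8 == 16 and
  // rounds addr_base down to the enclosing word, so the byte of interest
  // occupies bits 16..23 of the 4-byte value loaded below.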

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }
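  // The sequence above merges the new byte/short into its containing word:
  // (old ^ new) is masked to the operand width, shifted into lane position,
  // and XOR-ed back into val32, so only the selected lanes of the word stored
  // by the stXcx_ below differ from what the lXarx read.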

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}
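// Note for callers: cmpxchg_loop_body leaves CCR0 set by the stXcx_ above but
// emits no branch back to 'retry' itself; the caller decides whether to loop
// (strong CAS) or to treat a lost reservation as failure (weak CAS), as
// cmpxchg_generic() does below.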

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs an atomic compare-exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag      = cmp(compare_value, *addr_base)
// Register dest_current_value = *addr_base
// Register compare_value      Used to compare with value in memory
// Register exchange_value     Written to memory if compare_value == *addr_base
// Register addr_base          The memory location to compareXChange
// Register int_flag_success   Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare-exchange, the value is tested beforehand.
// Several special cases exist to avoid generating code that is not needed.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    } else {
      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    }
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register temp2,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size_in_bytes());

  lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (return_method) {
    if (itable_index.is_register()) {
      Register itable_offset = itable_index.as_register();
      sldi(method_result, itable_offset, logMEsize);
      if (itentry_off) { addi(method_result, method_result, itentry_off); }
      add(method_result, method_result, recv_klass);
    } else {
      long itable_offset = (long)itable_index.as_constant();
      // static address, no relocation
      add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
    }
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

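  // The scan loop below is emitted twice ("peeled" once): the first copy
  // branches to found_method on a hit, while the second inverts the test so
  // that a hit falls through, saving a branch on the expected path.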
  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, temp2, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    cmpdi(CCR0, temp2, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
    lwz(scan_temp, ito_offset, scan_temp);
    ldx(method_result, scan_temp, method_result);
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}

/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {

  const Register check_cache_offset = temp1_reg;
  const Register cached_super       = temp2_reg;

  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);

  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());

  bool must_load_sco  = (super_check_offset.constant_or_zero() == -1);
  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1 ||
         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
         "at most one NULL in the batch, usually");

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpd(CCR0, sub_klass, super_klass);
  beq(CCR0, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is always positive...
    lwz(check_cache_offset, sco_offset, super_klass);
    super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is register.
    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
  }
  // The loaded value is the offset from KlassOopDesc.

  ld(cached_super, super_check_offset, sub_klass);
  cmpd(CCR0, cached_super, super_klass);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }

  if (super_check_offset.is_register()) {
    beq(CCR0, *L_success);
    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      beq(CCR0, *L_slow_path);
    } else {
      bne(CCR0, *L_failure);
      FINAL_JUMP(*L_slow_path);
    }
  } else {
    if (super_check_offset.as_constant() == sc_offset) {
      // Need a slow path; fast failure is impossible.
      if (L_slow_path == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_slow_path);
        FINAL_JUMP(*L_success);
      }
    } else {
      // No slow path; it's a fast decision.
      if (L_failure == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_failure);
        FINAL_JUMP(*L_success);
      }
    }
  }

  bind(L_fallthrough);
#undef FINAL_JUMP
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Register result_reg) {
  const Register array_ptr = temp1_reg; // current value from cache array
  const Register temp      = temp2_reg;

  assert_different_registers(sub_klass, super_klass, array_ptr, temp);

  int source_offset = in_bytes(Klass::secondary_supers_offset());
  int target_offset = in_bytes(Klass::secondary_super_cache_offset());

  int length_offset = Array<Klass*>::length_offset_in_bytes();
  int base_offset   = Array<Klass*>::base_offset_in_bytes();

  Label hit, loop, failure, fallthru;

  ld(array_ptr, source_offset, sub_klass);

  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  lwz(temp, length_offset, array_ptr);
  cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0

  mtctr(temp); // load ctr

  bind(loop);
  // Oops in table are NO MORE compressed.
  ld(temp, base_offset, array_ptr);
  cmpd(CCR0, temp, super_klass);
  beq(CCR0, hit);
  addi(array_ptr, array_ptr, BytesPerWord);
  bdnz(loop);

  bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
  b(fallthru);

  bind(hit);
  std(super_klass, target_offset, sub_klass); // save result to cache
  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
  if (L_success != NULL) { b(*L_success); }
  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided

  bind(fallthru);
}

// Try fast path, then go to slow one if not successful
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp1_reg,
                                         Register temp2_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
  bind(L_failure); // Fallthru if not successful.
}

void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
  cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
  beq(CCR0, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
  cmpd(CCR0, thread, R0);
  if (L_slow_path == &L_fallthrough) {
    beq(CCR0, *L_fast_path);
  } else if (L_fast_path == &L_fallthrough) {
    bne(CCR0, *L_slow_path);
  } else {
    Unimplemented();
  }

  bind(L_fallthrough);
}

RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
                                                   Register temp_reg,
                                                   int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = extra_slot_offset * stackElementSize;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
    return offset;
  } else {
    assert(temp_reg != noreg, "must specify");
    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
    if (offset != 0)
      addi(temp_reg, temp_reg, offset);
    return temp_reg;
  }
}
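// Illustrative example: with Interpreter::stackElementSize == 8 (PPC64), a
// constant arg_slot of 2 and extra_slot_offset of 1 fold into the constant
// offset 24, while a register arg_slot is scaled at run time into temp_reg
// as (arg_slot << 3) + 8.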

// Supports temp2_reg = R0.
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
                                          Register mark_reg, Register temp_reg,
                                          Register temp2_reg, Label& done, Label* slow_case) {
  assert(UseBiasedLocking, "why call this otherwise?");

#ifdef ASSERT
  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
#endif

  Label cas_label;

  // Branch to done if fast path fails and no slow_case provided.
  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
         "biased locking makes assumptions about bit layout");

  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }

  andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
  bne(cr_reg, cas_label);

  load_klass(temp_reg, obj_reg);

  load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, R16_thread, temp_reg);
  xorr(temp_reg, mark_reg, temp_reg);
  andr(temp_reg, temp_reg, temp2_reg);
  cmpdi(cr_reg, temp_reg, 0);
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
    lwzx(mark_reg, temp2_reg);
    addi(mark_reg, mark_reg, 1);
    stwx(mark_reg, temp2_reg);
    // restore mark_reg
    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
    bind(l);
  }
  beq(cr_reg, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp2_reg, 0);
  bne(cr_reg, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.

  int shift_amount = 64 - markWord::epoch_shift;
  // rotate epoch bits to right (little) end and set other bits to 0
  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
  bne(CCR0, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
                            markWord::age_mask_in_place |
                            markWord::epoch_mask_in_place));
  orr(temp_reg, R16_thread, mark_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  load_klass(temp_reg, obj_reg);
  andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
  orr(temp2_reg, R16_thread, temp2_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, temp2_reg, temp_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  load_klass(temp_reg, obj_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
  orr(temp_reg, temp_reg, temp2_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock());

  // reload markWord in mark_reg before continuing with lightweight locking
  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);

  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
    bind(l);
  }

  bind(cas_label);
}

void MacroAssembler::biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.

  ld(temp_reg, 0, mark_addr);
  andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);

  cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
  beq(cr_reg, done);
}

// allocation (for C1)
void MacroAssembler::eden_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Register t2,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  b(slow_case);
}

void MacroAssembler::tlab_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // make sure arguments make sense
  assert_different_registers(obj, var_size_in_bytes, t1);
  assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");

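  // Bump-pointer allocation: obj = tlab_top; new_top = obj + size; if new_top
  // exceeds tlab_end, take the slow path, otherwise publish new_top as the new
  // tlab_top. The TLAB is thread-local, so no atomics are needed here.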
  const Register new_top = t1;
  //verify_tlab(); not implemented

  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
  if (var_size_in_bytes == noreg) {
    addi(new_top, obj, con_size_in_bytes);
  } else {
    add(new_top, obj, var_size_in_bytes);
  }
  cmpld(CCR0, new_top, R0);
  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);

#ifdef ASSERT
  // make sure new free pointer is properly aligned
  {
    Label L;
    andi_(R0, new_top, MinObjAlignmentInBytesMask);
    beq(CCR0, L);
    stop("updated TLAB free is not properly aligned");
    bind(L);
  }
#endif // ASSERT

  // update the tlab top pointer
  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  //verify_tlab(); not implemented
}

void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
  unimplemented("incr_allocated_bytes");
}

address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
                                             int insts_call_instruction_offset, Register Rtoc) {
  // Start the stub.
  address stub = start_a_stub(64);
  if (stub == NULL) { return NULL; } // CodeCache full: bail out

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // For java_to_interp stubs we use R11_scratch1 as scratch register
  // and in call trampoline stubs we use R12_scratch2. This way we
  // can distinguish them (see is_NativeCallTrampolineStub_at()).
  Register reg_scratch = R12_scratch2;

  // Now, create the trampoline stub's code:
  // - load the TOC
  // - load the call target from the constant pool
  // - call
  if (Rtoc == noreg) {
    calculate_address_from_global_toc(reg_scratch, method_toc());
    Rtoc = reg_scratch;
  }

  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
  mtctr(reg_scratch);
  bctr();

  const address stub_start_addr = addr_at(stub_start_offset);

  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
         "encoded offset into the constant pool must match");
  // Trampoline_stub_size should be good.
  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  // End the stub.
  end_a_stub();
  return stub;
}

// TM on PPC64.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
  Label retry;
  bind(retry);
  ldarx(result, addr, /*hint*/ false);
  addi(result, result, simm16);
  stdcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}

void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
  Label retry;
  bind(retry);
  lwarx(result, addr, /*hint*/ false);
  ori(result, result, uimm16);
  stwcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}
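// Usage note: atomic_ori_int() is used by rtm_abort_ratio_calculation() below
// to OR rtm_state bits (NoRTM / UseRTM) into the MethodData without losing
// concurrent updates to the same word.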

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters_Reg (RTMLockingCounters*)
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86 ppc (! means inverted, ? means not the same)
  //  0   31  Set if abort caused by XABORT instruction.
  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  //  3   10  Set if an internal buffer overflowed.
  //  4  ?12  Set if a debug breakpoint was hit.
  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
  const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
                             tm_failure_persistent,
                             tm_non_trans_cf,
                             tm_trans_cf,
                             tm_footprint_of,
                             tm_failure_code,
                             tm_transaction_level};

  const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
  const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;

  const int bit2counter_map[][num_counters] =
  // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
  // Inverted logic means that if a bit is set don't count it, or vice-versa.
  // Care must be taken when mapping bits to counters as bits for a given
  // counter must be mutually exclusive. Otherwise, the counter will be
  // incremented more than once.
  // counters:
  // 0       1        2         3         4       5
  // abort , persist, conflict, overflow, debug , nested     bits:
  {{ 1     , 0      , 0       , 0       , 0     , 0 },   // abort
   { 0     , -1     , 0       , 0       , 0     , 0 },   // failure_persistent
   { 0     , 0      , 1       , 0       , 0     , 0 },   // non_trans_cf
   { 0     , 0      , 1       , 0       , 0     , 0 },   // trans_cf
   { 0     , 0      , 0       , 1       , 0     , 0 },   // footprint_of
   { 0     , 0      , 0       , 0       , -1    , 0 },   // failure_code = 0xD4
   { 0     , 0      , 0       , 0       , 0     , 1 }};  // transaction_level > 1
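
  // Worked example: an abort caused by a tabort instruction sets TEXASR bit
  // tm_tabort; row 0 of the map routes it to counter 0 ("abort"). The
  // failure_persistent row uses inverted logic (-1): counter 1 is bumped when
  // the persistent bit is clear, i.e. for aborts that may succeed on retry,
  // mirroring x86 bit 1 above.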

  // Move abort_status value to R0 and use abort_status register as a
  // temporary register because R0 as third operand in ld/std is treated
  // as base address zero (value). Likewise, R0 as second operand in addi
  // is problematic because it amounts to li.
  const Register temp_Reg = abort_status;
  const Register abort_status_R0 = R0;
  mr(abort_status_R0, abort_status);

  // Increment total abort counter.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  ld(temp_Reg, counters_offs, rtm_counters_Reg);
  addi(temp_Reg, temp_Reg, 1);
  std(temp_Reg, counters_offs, rtm_counters_Reg);

  // Increment specific abort counters.
  if (PrintPreciseRTMLockingStatistics) {

    // #0 counter offset.
    int abortX_offs = RTMLockingCounters::abortX_count_offset();

    for (int nbit = 0; nbit < num_failure_bits; nbit++) {
      for (int ncounter = 0; ncounter < num_counters; ncounter++) {
        if (bit2counter_map[nbit][ncounter] != 0) {
          Label check_abort;
          int abort_counter_offs = abortX_offs + (ncounter << 3);

          if (failure_bit[nbit] == tm_transaction_level) {
            // Don't check outer transaction, TL = 1 (bit 63). Hence only
            // 11 bits in the TL field are checked to find out if failure
            // occurred in a nested transaction. This check also matches
            // the case when nesting_of = 1 (nesting overflow).
            rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
          } else if (failure_bit[nbit] == tm_failure_code) {
            // Check failure code for trap or illegal caught in TM.
            // Bits 0:7 are tested as bit 7 (persistent) is copied from
            // tabort or treclaim source operand.
            // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
            rldicl(temp_Reg, abort_status_R0, 8, 56);
            cmpdi(CCR0, temp_Reg, 0xD4);
          } else {
            rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
          }

          if (bit2counter_map[nbit][ncounter] == 1) {
            beq(CCR0, check_abort);
          } else {
            bne(CCR0, check_abort);
          }

          // We don't increment atomically.
          ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
          addi(temp_Reg, temp_Reg, 1);
          std(temp_Reg, abort_counter_offs, rtm_counters_Reg);

          bind(check_abort);
        }
      }
    }
  }
  // Restore abort_status.
  mr(abort_status, abort_status_R0);
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp and CR0 are killed
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
  mftb(tmp);
  andi_(tmp, tmp, count-1);
  bne(CCR0, brLabel);
}
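// The low bits of the time base register serve as a cheap pseudo-random
// source: with a power-of-2 'count', the branch is taken roughly
// (count-1)/count of the time, so the fall-through path executes about once
// per 'count' calls. Used below to sample RTM counter updates at
// RTMTotalCountIncrRate.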

// Perform abort ratio calculation, set no_rtm bit if high ratio.
// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation.
    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
    cmpdi(CCR0, rtm_counters_Reg, 0);
    beq(CCR0, L_done);
    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
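  // Illustrative numbers (assuming RTMTotalCountIncrRate == 64 and
  // RTMAbortRatio == 50): with abort_count == 40000 and total_count == 1000,
  // we compare 40000 * 100 == 4000000 against 1000 * 64 * 50 == 3200000;
  // since 4000000 >= 3200000, the no_rtm bit gets set.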
  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, R0, RTMAbortThreshold);
    blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
  } else {
    load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
    cmpd(CCR0, R0, rtm_counters_Reg);
    blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
  }
  mulli(R0, R0, 100);

  const Register tmpReg = rtm_counters_Reg;
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
  mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
  cmpd(CCR0, R0, tmpReg);
  blt(CCR0, L_check_always_rtm1); // jump to reload
  if (method_data != NULL) {
    // Set rtm_state to "no rtm" in MDO.
    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
  if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, tmpReg, thresholdValue);
  } else {
    load_const_optimized(R0, thresholdValue);
    cmpd(CCR0, tmpReg, R0);
  }
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation.
// input: abort_status_Reg
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}

// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;

  // Don't retry if failure is persistent.
  // The persistent bit is set when a (A) Disallowed operation is performed in
  // transactional state, like for instance trying to write the TFHAR after a
  // transaction is started; or when there is (B) a Nesting Overflow (too many
  // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
  // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
  // store is performed to a given address in TM state, then once in suspended
  // state the same address is accessed. Failure (A) is very unlikely to occur
  // in the JVM. Failure (D) will never occur because Suspended state is never
  // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
  // Overflow will set the persistent bit.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);

  // Don't retry if transaction was deliberately aborted, i.e. caused by a
  // tabort instruction.
  rldicr_(R0, abort_status_Reg, tm_tabort, 0);
  bne(CCR0, doneRetry);

  // Retry if transaction aborted due to a conflict with another thread.
  if (checkRetry) { bind(*checkRetry); }
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry);
  b(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: owner_addr_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry, doRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry);

  if (RTMSpinLoopCount > 1) {
    li(R0, RTMSpinLoopCount);
    mtctr(R0);
  }

  // low thread priority
  smt_prio_low();
  bind(SpinLoop);

  if (RTMSpinLoopCount > 1) {
    bdz(doRetry);
    ld(R0, 0, owner_addr_Reg);
    cmpdi(CCR0, R0, 0);
    bne(CCR0, SpinLoop);
  }

  bind(doRetry);

  // restore thread priority to default in userspace
#ifdef LINUX
  smt_prio_medium_low();
#else
  smt_prio_medium();
#endif

  b(retryLabel);

  bind(doneRetry);
}

// Use RTM for normal stack locks.
// Input: objReg (object to lock)
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_();
  beq(CCR0, L_on_abort);
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);      // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markWord::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                    // all done if unlocked

  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry);
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}
2746
2747 // Use RTM for inflating locks
2748 // inputs: obj (object to lock)
2749 // mark_word (current header - KILLED)
2750 // boxReg (on-stack box address (displaced header location) - KILLED)
rtm_inflated_locking(ConditionRegister flag,Register obj,Register mark_word,Register boxReg,Register retry_on_busy_count_Reg,Register retry_on_abort_count_Reg,RTMLockingCounters * rtm_counters,Metadata * method_data,bool profile_rtm,Label & DONE_LABEL)2751 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2752 Register obj, Register mark_word, Register boxReg,
2753 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2754 RTMLockingCounters* rtm_counters,
2755 Metadata* method_data, bool profile_rtm,
2756 Label& DONE_LABEL) {
2757 assert(UseRTMLocking, "why call this otherwise?");
2758 Label L_rtm_retry, L_decrement_retry, L_on_abort;
2759 // Clean monitor_value bit to get valid pointer.
2760 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2761
2762 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2763 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2764 const Register tmpReg = boxReg;
2765 const Register owner_addr_Reg = mark_word;
2766 addi(owner_addr_Reg, mark_word, owner_offset);
2767
2768 if (RTMRetryCount > 0) {
2769 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.
2770 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2771 bind(L_rtm_retry);
2772 }
2773 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2774 Label L_noincrement;
2775 if (RTMTotalCountIncrRate > 1) {
2776 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2777 }
2778 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2779 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2780 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2781 ldx(tmpReg, R0);
2782 addi(tmpReg, tmpReg, 1);
2783 stdx(tmpReg, R0);
2784 bind(L_noincrement);
2785 }
2786 tbegin_();
2787 beq(CCR0, L_on_abort);
2788 // We don't reload mark word. Will only be reset at safepoint.
2789 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2790 cmpdi(flag, R0, 0);
2791 beq(flag, DONE_LABEL);
2792
2793 if (UseRTMXendForLockBusy) {
2794 tend_();
2795 b(L_decrement_retry);
2796 } else {
2797 tabort_();
2798 }
2799 bind(L_on_abort);
2800 const Register abort_status_Reg = tmpReg;
2801 mftexasr(abort_status_Reg);
2802 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2803 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2804 // Restore owner_addr_Reg
2805 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2806 #ifdef ASSERT
2807 andi_(R0, mark_word, markWord::monitor_value);
2808 asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2809 #endif
2810 addi(owner_addr_Reg, mark_word, owner_offset);
2811 }
2812 if (RTMRetryCount > 0) {
2813 // Retry on lock abort if abort status is not permanent.
2814 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2815 }
2816
2817 // Appears unlocked - try to swing _owner from null to non-null.
2818 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2819 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2820 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2821
2822 if (RTMRetryCount > 0) {
2823 // Success: done. Otherwise: retry.
2824 b(DONE_LABEL);
2825 bind(L_decrement_retry);
2826 // Spin and retry if lock is busy.
2827 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2828 } else {
2829 bind(L_decrement_retry);
2830 }
2831 }
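
// Illustrative sketch only: for an inflated monitor the transaction elides the
// lock if the owner field reads NULL inside the transaction; otherwise we end
// (or abort) the transaction and either retry or fall back to acquiring the
// monitor with the CAS emitted above:
//
//   if (!tbegin()) goto on_abort;
//   if (monitor->owner == NULL) goto DONE;       // elided; HTM tracks conflicts
//   tend_or_tabort();                            // owner set: lock busy
//   ...
//   CAS(&monitor->owner, NULL, current_thread);  // fallback acquisition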
2832
2833 #endif // INCLUDE_RTM_OPT
2834
2835 // "The box" is the space on the stack where we copy the object mark.
2836 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2837 Register temp, Register displaced_header, Register current_header,
2838 bool try_bias,
2839 RTMLockingCounters* rtm_counters,
2840 RTMLockingCounters* stack_rtm_counters,
2841 Metadata* method_data,
2842 bool use_rtm, bool profile_rtm) {
2843 assert_different_registers(oop, box, temp, displaced_header, current_header);
2844 assert(flag != CCR0, "bad condition register");
2845 Label cont;
2846 Label object_has_monitor;
2847 Label cas_failed;
2848
2849 // Load markWord from object into displaced_header.
2850 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2851
2852
2853 if (try_bias) {
2854 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2855 }
2856
2857 #if INCLUDE_RTM_OPT
2858 if (UseRTMForStackLocks && use_rtm) {
2859 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2860 stack_rtm_counters, method_data, profile_rtm,
2861 cont, object_has_monitor);
2862 }
2863 #endif // INCLUDE_RTM_OPT
2864
2865 // Handle existing monitor.
2866 // The object has an existing monitor iff (mark & monitor_value) != 0.
2867 andi_(temp, displaced_header, markWord::monitor_value);
2868 bne(CCR0, object_has_monitor);
2869
2870 // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2871 ori(displaced_header, displaced_header, markWord::unlocked_value);
2872
2873 // Load Compare Value application register.
2874
2875 // Initialize the box. (Must happen before we update the object mark!)
2876 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2877
2878 // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2879 // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2880 cmpxchgd(/*flag=*/flag,
2881 /*current_value=*/current_header,
2882 /*compare_value=*/displaced_header,
2883 /*exchange_value=*/box,
2884 /*where=*/oop,
2885 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2886 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2887 noreg,
2888 &cas_failed,
2889 /*check without membar and ldarx first*/true);
2890 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2891
2892 // If the compare-and-exchange succeeded, then we found an unlocked
2893 // object and we have now locked it.
2894 b(cont);
2895
2896 bind(cas_failed);
2897 // We did not see an unlocked object so try the fast recursive case.
2898
2899 // Check if the owner is self by comparing the value in the markWord of object
2900 // (current_header) with the stack pointer.
2901 sub(current_header, current_header, R1_SP);
2902 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2903
2904 and_(R0/*==0?*/, current_header, temp);
2905 // If the condition is true, we take the success path at cont and can store 0 as the
2906 // displaced header in the box, which indicates that it is a recursive lock.
2907 mcrf(flag,CCR0);
2908 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2909
2910 // Handle existing monitor.
2911 b(cont);
2912
2913 bind(object_has_monitor);
2914 // The object's monitor m is unlocked iff m->owner == NULL,
2915 // otherwise m->owner may contain a thread or a stack address.
2916
2917 #if INCLUDE_RTM_OPT
2918 // Use the same RTM locking code in 32- and 64-bit VM.
2919 if (use_rtm) {
2920 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2921 rtm_counters, method_data, profile_rtm, cont);
2922 } else {
2923 #endif // INCLUDE_RTM_OPT
2924
2925 // Try to CAS m->owner from NULL to current thread.
2926 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2927 cmpxchgd(/*flag=*/flag,
2928 /*current_value=*/current_header,
2929 /*compare_value=*/(intptr_t)0,
2930 /*exchange_value=*/R16_thread,
2931 /*where=*/temp,
2932 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2933 MacroAssembler::cmpxchgx_hint_acquire_lock());
2934
2935 // Store a non-null value into the box.
2936 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2937
2938 # ifdef ASSERT
2939 bne(flag, cont);
2940 // We have acquired the monitor, check some invariants.
2941 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2942 // Invariant 1: _recursions should be 0.
2943 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2944 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2945 "monitor->_recursions should be 0");
2946 # endif
2947
2948 #if INCLUDE_RTM_OPT
2949 } // use_rtm()
2950 #endif
2951
2952 bind(cont);
2953 // flag == EQ indicates success
2954 // flag == NE indicates failure
2955 }
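
// Illustrative sketch only: leaving aside biased locking and RTM, the fast
// lock sequence above behaves roughly like the following pseudocode, with
// page_size and lock_mask standing for os::vm_page_size() and
// markWord::lock_mask_in_place:
//
//   markWord mark = obj->mark();
//   if (mark & monitor_value) { /* inflated: CAS monitor->owner, see below */ }
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark, mark | unlocked_value, box)) return success;
//   // CAS failed: recursive if the old mark is a stack address within our page.
//   if (((old_mark - SP) & (~(page_size - 1) | lock_mask)) == 0) {
//     box->displaced_header = 0;   // recursive stack lock
//     return success;
//   }
//   return failure;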
2956
2957 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2958 Register temp, Register displaced_header, Register current_header,
2959 bool try_bias, bool use_rtm) {
2960 assert_different_registers(oop, box, temp, displaced_header, current_header);
2961 assert(flag != CCR0, "bad condition register");
2962 Label cont;
2963 Label object_has_monitor;
2964
2965 if (try_bias) {
2966 biased_locking_exit(flag, oop, current_header, cont);
2967 }
2968
2969 #if INCLUDE_RTM_OPT
2970 if (UseRTMForStackLocks && use_rtm) {
2971 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2972 Label L_regular_unlock;
2973 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2974 andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2975 cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked
2976 bne(flag, L_regular_unlock); // else RegularLock
2977 tend_(); // otherwise end...
2978 b(cont); // ... and we're done
2979 bind(L_regular_unlock);
2980 }
2981 #endif
2982
2983 // Find the lock address and load the displaced header from the stack.
2984 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2985
2986 // If the displaced header is 0, we have a recursive unlock.
2987 cmpdi(flag, displaced_header, 0);
2988 beq(flag, cont);
2989
2990 // Handle existing monitor.
2991 // The object has an existing monitor iff (mark & monitor_value) != 0.
2992 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2993 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2994 andi_(R0, current_header, markWord::monitor_value);
2995 bne(CCR0, object_has_monitor);
2996
2997 // Check if it is still a lightweight lock; this is true if we see
2998 // the stack address of the basicLock in the markWord of the object.
2999 // Cmpxchg sets flag to cmpd(current_header, box).
3000 cmpxchgd(/*flag=*/flag,
3001 /*current_value=*/current_header,
3002 /*compare_value=*/box,
3003 /*exchange_value=*/displaced_header,
3004 /*where=*/oop,
3005 MacroAssembler::MemBarRel,
3006 MacroAssembler::cmpxchgx_hint_release_lock(),
3007 noreg,
3008 &cont);
3009
3010 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
3011
3012 // Handle existing monitor.
3013 b(cont);
3014
3015 bind(object_has_monitor);
3016 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3017 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3018 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3019
3020 // It's inflated.
3021 #if INCLUDE_RTM_OPT
3022 if (use_rtm) {
3023 Label L_regular_inflated_unlock;
3024 // Clean monitor_value bit to get valid pointer
3025 cmpdi(flag, temp, 0);
3026 bne(flag, L_regular_inflated_unlock);
3027 tend_();
3028 b(cont);
3029 bind(L_regular_inflated_unlock);
3030 }
3031 #endif
3032
3033 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3034 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.
3035 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3036 cmpdi(flag, temp, 0);
3037 bne(flag, cont);
3038
3039 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3040 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3041 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3042 cmpdi(flag, temp, 0);
3043 bne(flag, cont);
3044 release();
3045 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3046
3047 bind(cont);
3048 // flag == EQ indicates success
3049 // flag == NE indicates failure
3050 }
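
// Illustrative sketch only: the fast unlock above is roughly
//
//   displaced = box->displaced_header;
//   if (displaced == 0) return success;          // recursive unlock
//   if (!(obj->mark() & monitor_value))
//     return CAS(&obj->mark, box, displaced);    // restore the displaced header
//   // Inflated: only the owner, with no recursions and empty queues, may release.
//   if (monitor->owner != self || monitor->recursions != 0) return failure;
//   if (monitor->EntryList != NULL || monitor->cxq != NULL) return failure;
//   release_store(&monitor->owner, NULL);
//   return success;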
3051
3052 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3053 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3054 // Armed page has poll_bit set.
3055 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3056 bne(CCR0, slow_path);
3057 }
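
// Illustrative sketch only: the poll reads the per-thread polling word and
// tests the "armed" bit, i.e. approximately
//
//   if (thread->polling_page & SafepointMechanism::poll_bit()) goto slow_path;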
3058
3059 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3060 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3061 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3062 }
3063
3064 // Values for last_Java_pc and last_Java_sp must comply with the rules
3065 // in frame_ppc.hpp.
3066 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3067 // Always set last_Java_pc and flags first because once last_Java_sp
3068 // is visible, has_last_Java_frame is true and users will look at the
3069 // rest of the fields. (Note: flags should always be zero before we
3070 // get here, so it doesn't need to be set.)
3071
3072 // Verify that last_Java_pc was zeroed on return to Java
3073 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3074 "last_Java_pc not zeroed before leaving Java");
3075
3076 // When returning from calling out from Java mode the frame anchor's
3077 // last_Java_pc will always be set to NULL. It is set here so that
3078 // if we are doing a call to native (not VM) we capture the
3079 // known pc and don't have to rely on the native call having a
3080 // standard frame linkage where we can find the pc.
3081 if (last_Java_pc != noreg)
3082 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3083
3084 // Set last_Java_sp last.
3085 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3086 }
3087
3088 void MacroAssembler::reset_last_Java_frame(void) {
3089 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3090 R16_thread, "SP was not set, still zero");
3091
3092 BLOCK_COMMENT("reset_last_Java_frame {");
3093 li(R0, 0);
3094
3095 // _last_Java_sp = 0
3096 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3097
3098 // _last_Java_pc = 0
3099 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3100 BLOCK_COMMENT("} reset_last_Java_frame");
3101 }
3102
3103 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3104 assert_different_registers(sp, tmp1);
3105
3106 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3107 // TOP_IJAVA_FRAME_ABI.
3108 // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3109 address entry = pc();
3110 load_const_optimized(tmp1, entry);
3111
3112 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3113 }
3114
3115 void MacroAssembler::get_vm_result(Register oop_result) {
3116 // Read:
3117 // R16_thread
3118 // R16_thread->in_bytes(JavaThread::vm_result_offset())
3119 //
3120 // Updated:
3121 // oop_result
3122 // R16_thread->in_bytes(JavaThread::vm_result_offset())
3123
3124 verify_thread();
3125
3126 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3127 li(R0, 0);
3128 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3129
3130 verify_oop(oop_result, FILE_AND_LINE);
3131 }
3132
3133 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3134 // Read:
3135 // R16_thread
3136 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3137 //
3138 // Updated:
3139 // metadata_result
3140 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3141
3142 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3143 li(R0, 0);
3144 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3145 }
3146
3147 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3148 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3149 if (CompressedKlassPointers::base() != 0) {
3150 // Use dst as temp if it is free.
3151 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3152 current = dst;
3153 }
3154 if (CompressedKlassPointers::shift() != 0) {
3155 srdi(dst, current, CompressedKlassPointers::shift());
3156 current = dst;
3157 }
3158 return current;
3159 }
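
// Illustrative sketch only: the encoding computed above is
//
//   narrowKlass nk = (narrowKlass)((klass_ptr - base) >> shift);
//
// with base = CompressedKlassPointers::base() and shift =
// CompressedKlassPointers::shift(); a zero base or shift lets the
// corresponding instruction be omitted, as in the register code above.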
3160
3161 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3162 if (UseCompressedClassPointers) {
3163 Register compressedKlass = encode_klass_not_null(ck, klass);
3164 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3165 } else {
3166 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3167 }
3168 }
3169
3170 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3171 if (UseCompressedClassPointers) {
3172 if (val == noreg) {
3173 val = R0;
3174 li(val, 0);
3175 }
3176 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3177 }
3178 }
3179
3180 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3181 if (!UseCompressedClassPointers) return 0;
3182 int num_instrs = 1; // shift or move
3183 if (CompressedKlassPointers::base() != 0) num_instrs = 7; // shift + load const + add
3184 return num_instrs * BytesPerInstWord;
3185 }
3186
3187 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3188 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3189 if (src == noreg) src = dst;
3190 Register shifted_src = src;
3191 if (CompressedKlassPointers::shift() != 0 ||
3192 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required.
3193 shifted_src = dst;
3194 sldi(shifted_src, src, CompressedKlassPointers::shift());
3195 }
3196 if (CompressedKlassPointers::base() != 0) {
3197 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3198 }
3199 }
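
// Illustrative sketch only: decoding reverses the encoding,
//
//   klass_ptr = base + ((uintptr_t)nk << shift);
//
// again skipping the add or the shift when base or shift is zero.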
3200
3201 void MacroAssembler::load_klass(Register dst, Register src) {
3202 if (UseCompressedClassPointers) {
3203 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3204 // Attention: no null check here!
3205 decode_klass_not_null(dst, dst);
3206 } else {
3207 ld(dst, oopDesc::klass_offset_in_bytes(), src);
3208 }
3209 }
3210
3211 // ((OopHandle)result).resolve();
3212 void MacroAssembler::resolve_oop_handle(Register result) {
3213 // OopHandle::resolve is an indirection.
3214 ld(result, 0, result);
3215 }
3216
3217 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3218 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3219 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3220 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3221 resolve_oop_handle(mirror);
3222 }
3223
3224 void MacroAssembler::load_method_holder(Register holder, Register method) {
3225 ld(holder, in_bytes(Method::const_offset()), method);
3226 ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3227 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3228 }
3229
3230 // Clear Array
3231 // For very short arrays. tmp == R0 is allowed.
3232 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3233 if (cnt_dwords > 0) { li(tmp, 0); }
3234 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3235 }
3236
3237 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3238 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3239 if (cnt_dwords < 8) {
3240 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3241 return;
3242 }
3243
3244 Label loop;
3245 const long loopcnt = cnt_dwords >> 1,
3246 remainder = cnt_dwords & 1;
3247
3248 li(tmp, loopcnt);
3249 mtctr(tmp);
3250 li(tmp, 0);
3251 bind(loop);
3252 std(tmp, 0, base_ptr);
3253 std(tmp, 8, base_ptr);
3254 addi(base_ptr, base_ptr, 16);
3255 bdnz(loop);
3256 if (remainder) { std(tmp, 0, base_ptr); }
3257 }
3258
3259 // Kills both input registers. tmp == R0 is allowed.
3260 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3261 // Procedure for large arrays (uses data cache block zero instruction).
3262 Label startloop, fast, fastloop, small_rest, restloop, done;
3263 const int cl_size = VM_Version::L1_data_cache_line_size(),
3264 cl_dwords = cl_size >> 3,
3265 cl_dw_addr_bits = exact_log2(cl_dwords),
3266 dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
3267 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3268
3269 if (const_cnt >= 0) {
3270 // Constant case.
3271 if (const_cnt < min_cnt) {
3272 clear_memory_constlen(base_ptr, const_cnt, tmp);
3273 return;
3274 }
3275 load_const_optimized(cnt_dwords, const_cnt, tmp);
3276 } else {
3277 // cnt_dwords already loaded in register. Need to check size.
3278 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3279 blt(CCR1, small_rest);
3280 }
3281 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3282 beq(CCR0, fast); // Already 128byte aligned.
3283
3284 subfic(tmp, tmp, cl_dwords);
3285 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3286 subf(cnt_dwords, tmp, cnt_dwords); // rest.
3287 li(tmp, 0);
3288
3289 bind(startloop); // Clear at the beginning to reach 128byte boundary.
3290 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3291 addi(base_ptr, base_ptr, 8);
3292 bdnz(startloop);
3293
3294 bind(fast); // Clear 128byte blocks.
3295 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
3296 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3297 mtctr(tmp); // Load counter.
3298
3299 bind(fastloop);
3300 dcbz(base_ptr); // Clear 128byte aligned block.
3301 addi(base_ptr, base_ptr, cl_size);
3302 bdnz(fastloop);
3303
3304 bind(small_rest);
3305 cmpdi(CCR0, cnt_dwords, 0); // size 0?
3306 beq(CCR0, done); // rest == 0
3307 li(tmp, 0);
3308 mtctr(cnt_dwords); // Load counter.
3309
3310 bind(restloop); // Clear rest.
3311 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3312 addi(base_ptr, base_ptr, 8);
3313 bdnz(restloop);
3314
3315 bind(done);
3316 }
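
// Illustrative C sketch only (hypothetical helper; dcbz() stands for the
// data-cache-block-zero instruction clearing one cl_size-byte line, and n is
// assumed large enough that dcbz pays off; the code above falls back to the
// simple loops for short lengths):
//
//   void clear_dwords(uint64_t* p, size_t n) {
//     while (n > 0 && ((uintptr_t)p & (cl_size - 1)) != 0) { *p++ = 0; --n; } // head
//     while (n >= cl_dwords) { dcbz(p); p += cl_dwords; n -= cl_dwords; }     // full lines
//     while (n-- > 0) { *p++ = 0; }                                           // tail
//   }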
3317
3318 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3319
3320 // Helpers for Intrinsic Emitters
3321 //
3322 // Revert the byte order of a 32bit value in a register
3323 // src: 0x44556677
3324 // dst: 0x77665544
3325 // Three steps to obtain the result:
3326 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3327 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3328 // This value initializes dst.
3329 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3330 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3331 // This value is mask inserted into dst with a [0..23] mask of 1s.
3332 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3333 // This value is mask inserted into dst with a [8..15] mask of 1s.
3334 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3335 assert_different_registers(dst, src);
3336
3337 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3338 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3339 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3340 }
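
// Illustrative sketch only: the three rotate-and-insert instructions above
// compute the plain 32-bit byte swap
//
//   uint32_t bswap32(uint32_t src) {
//     return ((src >> 24) & 0x000000ffu) | ((src >> 8) & 0x0000ff00u) |
//            ((src << 8) & 0x00ff0000u) | ((src << 24) & 0xff000000u);
//   }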
3341
3342 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3343 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3344 // body size from 20 to 16 instructions.
3345 // Returns the offset that was used to calculate the address of column tc3.
3346 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3347 // at hand, the original table address can be easily reconstructed.
3348 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3349 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3350
3351 // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3352 // Layout: See StubRoutines::generate_crc_constants.
3353 #ifdef VM_LITTLE_ENDIAN
3354 const int ix0 = 3 * CRC32_TABLE_SIZE;
3355 const int ix1 = 2 * CRC32_TABLE_SIZE;
3356 const int ix2 = 1 * CRC32_TABLE_SIZE;
3357 const int ix3 = 0 * CRC32_TABLE_SIZE;
3358 #else
3359 const int ix0 = 1 * CRC32_TABLE_SIZE;
3360 const int ix1 = 2 * CRC32_TABLE_SIZE;
3361 const int ix2 = 3 * CRC32_TABLE_SIZE;
3362 const int ix3 = 4 * CRC32_TABLE_SIZE;
3363 #endif
3364 assert_different_registers(table, tc0, tc1, tc2);
3365 assert(table == tc3, "must be!");
3366
3367 addi(tc0, table, ix0);
3368 addi(tc1, table, ix1);
3369 addi(tc2, table, ix2);
3370 if (ix3 != 0) addi(tc3, table, ix3);
3371
3372 return ix3;
3373 }
3374
3375 /**
3376 * uint32_t crc;
3377 * table[crc & 0xFF] ^ (crc >> 8);
3378 */
3379 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3380 assert_different_registers(crc, table, tmp);
3381 assert_different_registers(val, table);
3382
3383 if (crc == val) { // Must rotate first to use the unmodified value.
3384 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3385 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3386 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3387 } else {
3388 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3389 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3390 }
3391 lwzx(tmp, table, tmp);
3392 xorr(crc, crc, tmp);
3393 }
3394
3395 /**
3396 * Emits code to update CRC-32 with a byte value according to constants in table.
3397 *
3398 * @param [in,out]crc Register containing the crc.
3399 * @param [in]val Register containing the byte to fold into the CRC.
3400 * @param [in]table Register containing the table of crc constants.
3401 *
3402 * uint32_t crc;
3403 * val = crc_table[(val ^ crc) & 0xFF];
3404 * crc = val ^ (crc >> 8);
3405 */
3406 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3407 BLOCK_COMMENT("update_byte_crc32:");
3408 xorr(val, val, crc);
3409 fold_byte_crc32(crc, val, table, val);
3410 }
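
// Illustrative sketch only: together with fold_byte_crc32 this is the classic
// byte-at-a-time table-driven CRC update
//
//   uint32_t update_byte(uint32_t crc, uint8_t val, const uint32_t table[256]) {
//     return table[(val ^ crc) & 0xff] ^ (crc >> 8);
//   }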
3411
3412 /**
3413 * @param crc register containing existing CRC (32-bit)
3414 * @param buf register pointing to input byte buffer (byte*)
3415 * @param len register containing number of bytes
3416 * @param table register pointing to CRC table
3417 */
3418 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3419 Register data, bool loopAlignment) {
3420 assert_different_registers(crc, buf, len, table, data);
3421
3422 Label L_mainLoop, L_done;
3423 const int mainLoop_stepping = 1;
3424 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3425
3426 // Process all bytes in a single-byte loop.
3427 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3428 beq(CCR0, L_done);
3429
3430 mtctr(len);
3431 align(mainLoop_alignment);
3432 BIND(L_mainLoop);
3433 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3434 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3435 update_byte_crc32(crc, data, table);
3436 bdnz(L_mainLoop); // Iterate.
3437
3438 bind(L_done);
3439 }
3440
3441 /**
3442 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3443 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3444 */
3445 // A note on the lookup table address(es):
3446 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3447 // To save the effort of adding the column offset to the table address each time
3448 // a table element is looked up, it is possible to pass the pre-calculated
3449 // column addresses.
3450 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3451 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3452 Register t0, Register t1, Register t2, Register t3,
3453 Register tc0, Register tc1, Register tc2, Register tc3) {
3454 assert_different_registers(crc, t3);
3455
3456 // XOR crc with next four bytes of buffer.
3457 lwz(t3, bufDisp, buf);
3458 if (bufInc != 0) {
3459 addi(buf, buf, bufInc);
3460 }
3461 xorr(t3, t3, crc);
3462
3463 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3464 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
3465 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
3466 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
3467 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
3468
3469 // Use the pre-calculated column addresses.
3470 // Load pre-calculated table values.
3471 lwzx(t0, tc0, t0);
3472 lwzx(t1, tc1, t1);
3473 lwzx(t2, tc2, t2);
3474 lwzx(t3, tc3, t3);
3475
3476 // Calculate new crc from table values.
3477 xorr(t0, t0, t1);
3478 xorr(t2, t2, t3);
3479 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3480 }
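
// Illustrative sketch only: this is the "slicing-by-4" scheme. With tc0..tc3
// denoting the four precomputed table columns, one 4-byte step computes
//
//   uint32_t w = crc ^ load_32(buf);   // byte-reversed beforehand on Big Endian
//   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^
//         tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];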
3481
3482 /**
3483 * @param crc register containing existing CRC (32-bit)
3484 * @param buf register pointing to input byte buffer (byte*)
3485 * @param len register containing number of bytes
3486 * @param table register pointing to CRC table
3487 *
3488 * uses R9..R12 as work register. Must be saved/restored by caller!
3489 */
3490 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3491 Register t0, Register t1, Register t2, Register t3,
3492 Register tc0, Register tc1, Register tc2, Register tc3,
3493 bool invertCRC) {
3494 assert_different_registers(crc, buf, len, table);
3495
3496 Label L_mainLoop, L_tail;
3497 Register tmp = t0;
3498 Register data = t0;
3499 Register tmp2 = t1;
3500 const int mainLoop_stepping = 4;
3501 const int tailLoop_stepping = 1;
3502 const int log_stepping = exact_log2(mainLoop_stepping);
3503 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3504 const int complexThreshold = 2*mainLoop_stepping;
3505
3506 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3507 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3508 // for all well-behaved cases. The situation itself is detected and handled correctly
3509 // within update_byteLoop_crc32.
3510 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3511
3512 BLOCK_COMMENT("kernel_crc32_1word {");
3513
3514 if (invertCRC) {
3515 nand(crc, crc, crc); // 1s complement of crc
3516 }
3517
3518 // Check for short (<complexThreshold) buffer.
3519 cmpdi(CCR0, len, complexThreshold);
3520 blt(CCR0, L_tail);
3521
3522 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3523 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3524 {
3525 // Align buf addr to mainLoop_stepping boundary.
3526 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3527 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 by 0 bits and AND with a mask of 1s in the low log_stepping bits (62..63).
3528
3529 if (complexThreshold > mainLoop_stepping) {
3530 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3531 } else {
3532 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3533 cmpdi(CCR0, tmp, mainLoop_stepping);
3534 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3535 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3536 }
3537 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3538 }
3539
3540 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3541 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3542 mtctr(tmp2);
3543
3544 #ifdef VM_LITTLE_ENDIAN
3545 Register crc_rv = crc;
3546 #else
3547 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3548 // Occupies tmp, but frees up crc.
3549 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3550 tmp = crc;
3551 #endif
3552
3553 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3554
3555 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3556 BIND(L_mainLoop);
3557 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3558 bdnz(L_mainLoop);
3559
3560 #ifndef VM_LITTLE_ENDIAN
3561 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3562 tmp = crc_rv; // Tmp uses its original register again.
3563 #endif
3564
3565 // Restore original table address for tailLoop.
3566 if (reconstructTableOffset != 0) {
3567 addi(table, table, -reconstructTableOffset);
3568 }
3569
3570 // Process last few (<complexThreshold) bytes of buffer.
3571 BIND(L_tail);
3572 update_byteLoop_crc32(crc, buf, len, table, data, false);
3573
3574 if (invertCRC) {
3575 nand(crc, crc, crc); // 1s complement of crc
3576 }
3577 BLOCK_COMMENT("} kernel_crc32_1word");
3578 }
3579
3580 /**
3581 * @param crc register containing existing CRC (32-bit)
3582 * @param buf register pointing to input byte buffer (byte*)
3583 * @param len register containing number of bytes
3584 * @param constants register pointing to precomputed constants
3585 * @param t0-t6 temp registers
3586 */
3587 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3588 Register t0, Register t1, Register t2, Register t3,
3589 Register t4, Register t5, Register t6, bool invertCRC) {
3590 assert_different_registers(crc, buf, len, constants);
3591
3592 Label L_tail;
3593
3594 BLOCK_COMMENT("kernel_crc32_vpmsum {");
3595
3596 if (invertCRC) {
3597 nand(crc, crc, crc); // 1s complement of crc
3598 }
3599
3600 // Enforce 32 bit.
3601 clrldi(len, len, 32);
3602
3603 // Align if we have enough bytes for the fast version.
3604 const int alignment = 16,
3605 threshold = 32;
3606 Register prealign = t0;
3607
3608 neg(prealign, buf);
3609 addi(t1, len, -threshold);
3610 andi(prealign, prealign, alignment - 1);
3611 cmpw(CCR0, t1, prealign);
3612 blt(CCR0, L_tail); // len - prealign < threshold?
3613
3614 subf(len, prealign, len);
3615 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3616
3617 // Calculate from first aligned address as far as possible.
3618 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3619 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3620 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3621
3622 // Remaining bytes.
3623 BIND(L_tail);
3624 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3625
3626 if (invertCRC) {
3627 nand(crc, crc, crc); // 1s complement of crc
3628 }
3629
3630 BLOCK_COMMENT("} kernel_crc32_vpmsum");
3631 }
3632
3633 /**
3634 * @param crc register containing existing CRC (32-bit)
3635 * @param buf register pointing to input byte buffer (byte*)
3636 * @param len register containing number of bytes (will get updated to remaining bytes)
3637 * @param constants register pointing to CRC table for 128-bit aligned memory
3638 * @param t0-t6 temp registers
3639 */
3640 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3641 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3642
3643 // Save non-volatile vector registers (frameless).
3644 Register offset = t1;
3645 int offsetInt = 0;
3646 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3647 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3648 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3649 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3650 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3651 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3652 #ifndef VM_LITTLE_ENDIAN
3653 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3654 #endif
3655 offsetInt -= 8; std(R14, offsetInt, R1_SP);
3656 offsetInt -= 8; std(R15, offsetInt, R1_SP);
3657
3658 // Implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3659 // bytes per iteration. The basic scheme is:
3660 // lvx: load vector (Big Endian needs reversal)
3661 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3662 // vxor: xor partial results together to get unroll_factor2 vectors
3663
3664 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3665
3666 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3667 const int unroll_factor = CRC32_UNROLL_FACTOR,
3668 unroll_factor2 = CRC32_UNROLL_FACTOR2;
3669
3670 const int outer_consts_size = (unroll_factor2 - 1) * 16,
3671 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3672
3673 // Support registers.
3674 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3675 Register num_bytes = R14,
3676 loop_count = R15,
3677 cur_const = crc; // will live in VCRC
3678 // Constant array for outer loop: unroll_factor2 - 1 registers,
3679 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3680 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3681 consts1[] = { VR23, VR24 };
3682 // Data register arrays: 2 arrays with unroll_factor2 registers.
3683 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3684 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3685
3686 VectorRegister VCRC = data0[0];
3687 VectorRegister Vc = VR25;
3688 VectorRegister swap_bytes = VR26; // Only for Big Endian.
3689
3690 // We have at least 1 iteration (ensured by caller).
3691 Label L_outer_loop, L_inner_loop, L_last;
3692
3693 // If supported, set DSCR pre-fetch to deepest.
3694 if (VM_Version::has_mfdscr()) {
3695 load_const_optimized(t0, VM_Version::_dscr_val | 7);
3696 mtdscr(t0);
3697 }
3698
3699 mtvrwz(VCRC, crc); // crc lives in VCRC now
3700
3701 for (int i = 1; i < unroll_factor2; ++i) {
3702 li(offs[i], 16 * i);
3703 }
3704
3705 // Load consts for outer loop
3706 lvx(consts0[0], constants);
3707 for (int i = 1; i < unroll_factor2 - 1; ++i) {
3708 lvx(consts0[i], offs[i], constants);
3709 }
3710
3711 load_const_optimized(num_bytes, 16 * unroll_factor);
3712
3713 // Reuse data registers outside of the loop.
3714 VectorRegister Vtmp = data1[0];
3715 VectorRegister Vtmp2 = data1[1];
3716 VectorRegister zeroes = data1[2];
3717
3718 vspltisb(Vtmp, 0);
3719 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3720
3721 // Load vector for vpermxor (to xor both 64 bit parts together)
3722 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
3723 vspltisb(Vc, 4);
3724 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3725 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3726 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3727
3728 #ifdef VM_LITTLE_ENDIAN
3729 #define BE_swap_bytes(x)
3730 #else
3731 vspltisb(Vtmp2, 0xf);
3732 vxor(swap_bytes, Vtmp, Vtmp2);
3733 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3734 #endif
3735
3736 cmpd(CCR0, len, num_bytes);
3737 blt(CCR0, L_last);
3738
3739 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3740 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3741
3742 // ********** Main loop start **********
3743 align(32);
3744 bind(L_outer_loop);
3745
3746 // Begin of unrolled first iteration (no xor).
3747 lvx(data1[0], buf);
3748 for (int i = 1; i < unroll_factor2 / 2; ++i) {
3749 lvx(data1[i], offs[i], buf);
3750 }
3751 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3752 lvx(consts1[0], cur_const);
3753 mtctr(loop_count);
3754 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3755 BE_swap_bytes(data1[i]);
3756 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3757 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3758 vpmsumw(data0[i], data1[i], consts1[0]);
3759 }
3760 addi(buf, buf, 16 * unroll_factor2);
3761 subf(len, num_bytes, len);
3762 lvx(consts1[1], offs[1], cur_const);
3763 addi(cur_const, cur_const, 32);
3764 // Begin of unrolled second iteration (head).
3765 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3766 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3767 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3768 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3769 }
3770 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3771 BE_swap_bytes(data1[i]);
3772 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3773 vpmsumw(data1[i], data1[i], consts1[1]);
3774 }
3775 addi(buf, buf, 16 * unroll_factor2);
3776
3777 // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
3778 // Double-iteration allows using the 2 constant registers alternately.
3779 align(32);
3780 bind(L_inner_loop);
3781 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3782 if (j & 1) {
3783 lvx(consts1[0], cur_const);
3784 } else {
3785 lvx(consts1[1], offs[1], cur_const);
3786 addi(cur_const, cur_const, 32);
3787 }
3788 for (int i = 0; i < unroll_factor2; ++i) {
3789 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3790 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3791 BE_swap_bytes(data1[idx]);
3792 vxor(data0[i], data0[i], data1[i]);
3793 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3794 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3795 }
3796 addi(buf, buf, 16 * unroll_factor2);
3797 }
3798 bdnz(L_inner_loop);
3799
3800 addi(cur_const, constants, outer_consts_size); // Reset
3801
3802 // Tail of last iteration (no loads).
3803 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3804 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3805 vxor(data0[i], data0[i], data1[i]);
3806 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3807 }
3808 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3809 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3810 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3811 }
3812
3813 // Last data register is ok, other ones need fixup shift.
3814 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3815 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3816 }
3817
3818 // Combine to 128 bit result vector VCRC = data0[0].
3819 for (int i = 1; i < unroll_factor2; i<<=1) {
3820 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3821 vxor(data0[j], data0[j], data0[j+i]);
3822 }
3823 }
3824 cmpd(CCR0, len, num_bytes);
3825 bge(CCR0, L_outer_loop);
3826
3827 // Last chance with lower num_bytes.
3828 bind(L_last);
3829 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3830 // Point behind last const for inner loop.
3831 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3832 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3833 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3834 subf(cur_const, R0, cur_const); // Point to constant to be used first.
3835
3836 addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3837 bgt(CCR0, L_outer_loop);
3838 // ********** Main loop end **********
3839
3840 // Restore DSCR pre-fetch value.
3841 if (VM_Version::has_mfdscr()) {
3842 load_const_optimized(t0, VM_Version::_dscr_val);
3843 mtdscr(t0);
3844 }
3845
3846 // ********** Simple loop for remaining 16 byte blocks **********
3847 {
3848 Label L_loop, L_done;
3849
3850 srdi_(t0, len, 4); // 16 bytes per iteration
3851 clrldi(len, len, 64-4);
3852 beq(CCR0, L_done);
3853
3854 // Point to const (same as last const for inner loop).
3855 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3856 mtctr(t0);
3857 lvx(Vtmp2, cur_const);
3858
3859 align(32);
3860 bind(L_loop);
3861
3862 lvx(Vtmp, buf);
3863 addi(buf, buf, 16);
3864 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3865 BE_swap_bytes(Vtmp);
3866 vxor(VCRC, VCRC, Vtmp);
3867 vpmsumw(VCRC, VCRC, Vtmp2);
3868 bdnz(L_loop);
3869
3870 bind(L_done);
3871 }
3872 // ********** Simple loop end **********
3873 #undef BE_swap_bytes
3874
3875 // Point to Barrett constants
3876 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3877
3878 vspltisb(zeroes, 0);
3879
3880 // Combine to 64 bit result.
3881 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3882
3883 // Reduce to 32 bit CRC: Remainder by multiply-high.
3884 lvx(Vtmp, cur_const);
3885 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
3886 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
3887 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3888 vsldoi(Vtmp, zeroes, Vtmp, 8);
3889 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
3890 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
3891
3892 // Move result. len is already updated.
3893 vsldoi(VCRC, VCRC, zeroes, 8);
3894 mfvrd(crc, VCRC);
3895
3896 // Restore non-volatile Vector registers (frameless).
3897 offsetInt = 0;
3898 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3899 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3900 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3901 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3902 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3903 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3904 #ifndef VM_LITTLE_ENDIAN
3905 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3906 #endif
3907 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
3908 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
3909 }
3910
3911 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3912 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3913 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3914 : StubRoutines::crc_table_addr() , R0);
3915
3916 if (VM_Version::has_vpmsumb()) {
3917 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3918 } else {
3919 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3920 }
3921 }
3922
3923 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3924 assert_different_registers(crc, val, table);
3925
3926 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3927 if (invertCRC) {
3928 nand(crc, crc, crc); // 1s complement of crc
3929 }
3930
3931 update_byte_crc32(crc, val, table);
3932
3933 if (invertCRC) {
3934 nand(crc, crc, crc); // 1s complement of crc
3935 }
3936 }
3937
3938 // dest_lo += src1 + src2
3939 // dest_hi += carry1 + carry2
3940 void MacroAssembler::add2_with_carry(Register dest_hi,
3941 Register dest_lo,
3942 Register src1, Register src2) {
3943 li(R0, 0);
3944 addc(dest_lo, dest_lo, src1);
3945 adde(dest_hi, dest_hi, R0);
3946 addc(dest_lo, dest_lo, src2);
3947 adde(dest_hi, dest_hi, R0);
3948 }
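
// Illustrative sketch only: the addc/adde pairs implement a 128-bit
// accumulation of two zero-extended 64-bit addends, i.e.
//
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo = (uint64_t)acc;
//   dest_hi = (uint64_t)(acc >> 64);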
3949
3950 // Multiply 64 bit by 64 bit first loop.
3951 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3952 Register x_xstart,
3953 Register y, Register y_idx,
3954 Register z,
3955 Register carry,
3956 Register product_high, Register product,
3957 Register idx, Register kdx,
3958 Register tmp) {
3959 // jlong carry, x[], y[], z[];
3960 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3961 // huge_128 product = y[idx] * x[xstart] + carry;
3962 // z[kdx] = (jlong)product;
3963 // carry = (jlong)(product >>> 64);
3964 // }
3965 // z[xstart] = carry;
3966
3967 Label L_first_loop, L_first_loop_exit;
3968 Label L_one_x, L_one_y, L_multiply;
3969
3970 addic_(xstart, xstart, -1);
3971 blt(CCR0, L_one_x); // Special case: length of x is 1.
3972
3973 // Load next two integers of x.
3974 sldi(tmp, xstart, LogBytesPerInt);
3975 ldx(x_xstart, x, tmp);
3976 #ifdef VM_LITTLE_ENDIAN
3977 rldicl(x_xstart, x_xstart, 32, 0);
3978 #endif
3979
3980 align(32, 16);
3981 bind(L_first_loop);
3982
3983 cmpdi(CCR0, idx, 1);
3984 blt(CCR0, L_first_loop_exit);
3985 addi(idx, idx, -2);
3986 beq(CCR0, L_one_y);
3987
3988 // Load next two integers of y.
3989 sldi(tmp, idx, LogBytesPerInt);
3990 ldx(y_idx, y, tmp);
3991 #ifdef VM_LITTLE_ENDIAN
3992 rldicl(y_idx, y_idx, 32, 0);
3993 #endif
3994
3995
3996 bind(L_multiply);
3997 multiply64(product_high, product, x_xstart, y_idx);
3998
3999 li(tmp, 0);
4000 addc(product, product, carry); // Add carry to result.
4001 adde(product_high, product_high, tmp); // Add carry of the last addition.
4002 addi(kdx, kdx, -2);
4003
4004 // Store result.
4005 #ifdef VM_LITTLE_ENDIAN
4006 rldicl(product, product, 32, 0);
4007 #endif
4008 sldi(tmp, kdx, LogBytesPerInt);
4009 stdx(product, z, tmp);
4010 mr_if_needed(carry, product_high);
4011 b(L_first_loop);
4012
4013
4014 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4015
4016 lwz(y_idx, 0, y);
4017 b(L_multiply);
4018
4019
4020 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4021
4022 lwz(x_xstart, 0, x);
4023 b(L_first_loop);
4024
4025 bind(L_first_loop_exit);
4026 }
4027
4028 // Multiply 64 bit by 64 bit and add 128 bit.
4029 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4030 Register z, Register yz_idx,
4031 Register idx, Register carry,
4032 Register product_high, Register product,
4033 Register tmp, int offset) {
4034
4035 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4036 // z[kdx] = (jlong)product;
4037
4038 sldi(tmp, idx, LogBytesPerInt);
4039 if (offset) {
4040 addi(tmp, tmp, offset);
4041 }
4042 ldx(yz_idx, y, tmp);
4043 #ifdef VM_LITTLE_ENDIAN
4044 rldicl(yz_idx, yz_idx, 32, 0);
4045 #endif
4046
4047 multiply64(product_high, product, x_xstart, yz_idx);
4048 ldx(yz_idx, z, tmp);
4049 #ifdef VM_LITTLE_ENDIAN
4050 rldicl(yz_idx, yz_idx, 32, 0);
4051 #endif
4052
4053 add2_with_carry(product_high, product, carry, yz_idx);
4054
4055 sldi(tmp, idx, LogBytesPerInt);
4056 if (offset) {
4057 addi(tmp, tmp, offset);
4058 }
4059 #ifdef VM_LITTLE_ENDIAN
4060 rldicl(product, product, 32, 0);
4061 #endif
4062 stdx(product, z, tmp);
4063 }
4064
4065 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4066 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4067 Register y, Register z,
4068 Register yz_idx, Register idx, Register carry,
4069 Register product_high, Register product,
4070 Register carry2, Register tmp) {
4071
4072 // jlong carry, x[], y[], z[];
4073 // int kdx = ystart+1;
4074 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4075 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4076 // z[kdx+idx+1] = (jlong)product;
4077 // jlong carry2 = (jlong)(product >>> 64);
4078 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4079 // z[kdx+idx] = (jlong)product;
4080 // carry = (jlong)(product >>> 64);
4081 // }
4082 // idx += 2;
4083 // if (idx > 0) {
4084 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4085 // z[kdx+idx] = (jlong)product;
4086 // carry = (jlong)(product >>> 64);
4087 // }
4088
4089 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4090 const Register jdx = R0;
4091
4092 // Scale the index.
4093 srdi_(jdx, idx, 2);
4094 beq(CCR0, L_third_loop_exit);
4095 mtctr(jdx);
4096
4097 align(32, 16);
4098 bind(L_third_loop);
4099
4100 addi(idx, idx, -4);
4101
4102 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4103 mr_if_needed(carry2, product_high);
4104
4105 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4106 mr_if_needed(carry, product_high);
4107 bdnz(L_third_loop);
4108
4109 bind(L_third_loop_exit); // Handle any left-over operand parts.
4110
4111 andi_(idx, idx, 0x3);
4112 beq(CCR0, L_post_third_loop_done);
4113
4114 Label L_check_1;
4115
4116 addic_(idx, idx, -2);
4117 blt(CCR0, L_check_1);
4118
4119 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4120 mr_if_needed(carry, product_high);
4121
4122 bind(L_check_1);
4123
4124 addi(idx, idx, 0x2);
4125 andi_(idx, idx, 0x1);
4126 addic_(idx, idx, -1);
4127 blt(CCR0, L_post_third_loop_done);
4128
4129 sldi(tmp, idx, LogBytesPerInt);
4130 lwzx(yz_idx, y, tmp);
4131 multiply64(product_high, product, x_xstart, yz_idx);
4132 lwzx(yz_idx, z, tmp);
4133
4134 add2_with_carry(product_high, product, yz_idx, carry);
4135
4136 sldi(tmp, idx, LogBytesPerInt);
4137 stwx(product, z, tmp);
4138 srdi(product, product, 32);
4139
4140 sldi(product_high, product_high, 32);
4141 orr(product, product, product_high);
4142 mr_if_needed(carry, product);
4143
4144 bind(L_post_third_loop_done);
4145 } // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

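  // Semantics sketch, roughly the Java loop this code implements (compare
  // java.math.BigInteger::implMulAdd; index handling is simplified and the
  // names below are illustrative):
  //
  //   long kLong = k & 0xffffffffL;
  //   long carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     long product = (in[j] & 0xffffffffL) * kLong +
  //                    (out[offset] & 0xffffffffL) + carry;
  //     out[offset--] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   // The final carry is left in the 'carry' register for the caller.
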
  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi  (CCR0,    len,     0);

  // Prepare variables
  subi   (offset,  offset,  4);
  li     (carry,   0);
  ble    (CCR0,    SKIP);

  mtctr  (len);
  subi   (len,     len,     1 );
  sldi   (len,     len,     2 );

  // Main loop
  bind(LOOP);
  lwzx   (tmp1,    len,     in   ); // Load in[j] (32 bits, zero-extended).
  lwzx   (tmp2,    offset,  out  ); // Load out[offset].
  mulld  (tmp1,    tmp1,    k    ); // Full 64-bit product of the 32-bit digits.
  add    (tmp2,    carry,   tmp2 );
  add    (tmp2,    tmp1,    tmp2 );
  stwx   (tmp2,    offset,  out  ); // Store lower 32 bits.
  srdi   (carry,   tmp2,    32   ); // Upper 32 bits become the new carry.
  subi   (offset,  offset,  4    );
  subi   (len,     len,     4    );
  bdnz   (LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);             // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len
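
// Note: this routine backs the java.math.BigInteger::multiplyToLen intrinsic
// (wired up in the platform stub generator); z is expected to provide room
// for zlen = xlen + ylen ints.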

void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg);
  bind(ok);
#endif
}
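
// Usage sketch: asm_assert branches on whatever CCR0 already holds, so set
// it with a compare first (register name and message are illustrative):
//
//   cmpdi(CCR0, Rvalue, 0);
//   asm_assert(true, "value must be zero"); // Stops if Rvalue != 0.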

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg);
#endif // ASSERT
}
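
// Usage sketch (offset and message are illustrative): trap unless the 64-bit
// word at [R16_thread + some_offset] is zero.
//
//   asm_assert_mems_zero(true, 8, some_offset, R16_thread, "field must be zero");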

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

void MacroAssembler::verify_coop(Register coop, const char* msg) {
  if (!VerifyOops) { return; }
  if (UseCompressedOops) { decode_heap_oop(coop); }
  verify_oop(coop, msg);
  if (UseCompressedOops) { encode_heap_oop(coop, coop); }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;

  BLOCK_COMMENT("verify_oop {");

  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0

  BLOCK_COMMENT("} verify_oop");
}
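
// Usage sketch (register, offset, and message are illustrative): verify an
// oop right after loading it.
//
//   ld(Roop, field_offset, Rbase);
//   verify_oop(Roop, "broken oop in receiver");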

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Emit an unconditional trap that is decoded by the signal handler; the
// message, if present, is embedded in the code stream right after the trap
// instruction so the handler can report it.
void MacroAssembler::stop(int type, const char* msg) {
  bool msg_present = (msg != NULL);

#ifndef PRODUCT
  block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
#else
  block_comment("stop {");
#endif

  if (msg_present) {
    type |= stop_msg_present;
  }
  tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
  if (msg_present) {
    emit_int64((uintptr_t)msg);
  }

  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    // Small fixed-size region: emit the stores fully unrolled.
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
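
// Usage sketch for the RAII form (flag and register names are illustrative):
// the code emitted inside the scope is skipped at run time whenever the
// watched flag byte is zero; the destructor binds the skip target.
//
//   {
//     SkipIfEqualZero skip_if(masm, Rtemp, &SomeBoolFlag);
//     // ... instructions emitted here execute only if SomeBoolFlag != 0 ...
//   } // Destructor binds the label; execution resumes here either way.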

void MacroAssembler::cache_wb(Address line) {
  assert(line.index() == noreg, "index should be noreg");
  assert(line.disp() == 0, "displacement should be 0");
  assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
  // dcbst (Data Cache Block Store) is not really a flush: it writes the cache
  // line back to memory (and thus to persistent memory) while leaving the
  // line valid in the cache.
  dcbst(line.base());
}

void MacroAssembler::cache_wbsync(bool is_presync) {
  assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
  // We only need a post sync barrier. Post means _after_ a cache line flush or
  // store instruction, pre means a barrier emitted before such an instruction.
  if (!is_presync) {
    fence();
  }
}
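
// Typical persistent-memory writeback sequence built from the two helpers
// above (a sketch; Rline is an illustrative register holding the cache line
// address):
//
//   cache_wbsync(true);         // Pre-sync: no barrier needed on PPC.
//   cache_wb(Address(Rline));   // dcbst the cache line at Rline.
//   cache_wbsync(false);        // Post-sync: full fence.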