1 /*
2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  *
24  */
25 
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/collectedHeap.inline.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "memory/resourceArea.hpp"
34 #include "nativeInst_ppc.hpp"
35 #include "oops/klass.inline.hpp"
36 #include "oops/methodData.hpp"
37 #include "prims/methodHandles.hpp"
38 #include "runtime/biasedLocking.hpp"
39 #include "runtime/icache.hpp"
40 #include "runtime/interfaceSupport.inline.hpp"
41 #include "runtime/objectMonitor.hpp"
42 #include "runtime/os.hpp"
43 #include "runtime/safepoint.hpp"
44 #include "runtime/safepointMechanism.hpp"
45 #include "runtime/sharedRuntime.hpp"
46 #include "runtime/stubRoutines.hpp"
47 #include "utilities/macros.hpp"
48 #include "utilities/powerOfTwo.hpp"
49 
50 #ifdef PRODUCT
51 #define BLOCK_COMMENT(str) // nothing
52 #else
53 #define BLOCK_COMMENT(str) block_comment(str)
54 #endif
55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
56 
57 #ifdef ASSERT
58 // On RISC, there's no benefit to verifying instruction boundaries.
pd_check_instruction_mark()59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
60 #endif
61 
ld_largeoffset_unchecked(Register d,int si31,Register a,int emit_filler_nop)62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
64   if (Assembler::is_simm(si31, 16)) {
65     ld(d, si31, a);
66     if (emit_filler_nop) nop();
67   } else {
68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
70     addis(d, a, hi);
71     ld(d, lo, d);
72   }
73 }
74 
ld_largeoffset(Register d,int si31,Register a,int emit_filler_nop)75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
76   assert_different_registers(d, a);
77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
78 }
79 
load_sized_value(Register dst,RegisterOrConstant offs,Register base,size_t size_in_bytes,bool is_signed)80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
81                                       size_t size_in_bytes, bool is_signed) {
82   switch (size_in_bytes) {
83   case  8:              ld(dst, offs, base);                         break;
84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
87   default:  ShouldNotReachHere();
88   }
89 }
90 
store_sized_value(Register dst,RegisterOrConstant offs,Register base,size_t size_in_bytes)91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
92                                        size_t size_in_bytes) {
93   switch (size_in_bytes) {
94   case  8:  std(dst, offs, base); break;
95   case  4:  stw(dst, offs, base); break;
96   case  2:  sth(dst, offs, base); break;
97   case  1:  stb(dst, offs, base); break;
98   default:  ShouldNotReachHere();
99   }
100 }
101 
align(int modulus,int max,int rem)102 void MacroAssembler::align(int modulus, int max, int rem) {
103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
104   if (padding > max) return;
105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
106 }
107 
108 // Issue instructions that calculate given TOC from global TOC.
calculate_address_from_global_toc(Register dst,address addr,bool hi16,bool lo16,bool add_relocation,bool emit_dummy_addr)109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
110                                                        bool add_relocation, bool emit_dummy_addr) {
111   int offset = -1;
112   if (emit_dummy_addr) {
113     offset = -128; // dummy address
114   } else if (addr != (address)(intptr_t)-1) {
115     offset = MacroAssembler::offset_to_global_toc(addr);
116   }
117 
118   if (hi16) {
119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
120   }
121   if (lo16) {
122     if (add_relocation) {
123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
124       relocate(internal_word_Relocation::spec(addr));
125     }
126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
127   }
128 }
129 
patch_calculate_address_from_global_toc_at(address a,address bound,address addr)130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
131   const int offset = MacroAssembler::offset_to_global_toc(addr);
132 
133   const address inst2_addr = a;
134   const int inst2 = *(int *)inst2_addr;
135 
136   // The relocation points to the second instruction, the addi,
137   // and the addi reads and writes the same register dst.
138   const int dst = inv_rt_field(inst2);
139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
140 
141   // Now, find the preceding addis which writes to dst.
142   int inst1 = 0;
143   address inst1_addr = inst2_addr - BytesPerInstWord;
144   while (inst1_addr >= bound) {
145     inst1 = *(int *) inst1_addr;
146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
147       // Stop, found the addis which writes dst.
148       break;
149     }
150     inst1_addr -= BytesPerInstWord;
151   }
152 
153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
156   return inst1_addr;
157 }
158 
get_address_of_calculate_address_from_global_toc_at(address a,address bound)159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
160   const address inst2_addr = a;
161   const int inst2 = *(int *)inst2_addr;
162 
163   // The relocation points to the second instruction, the addi,
164   // and the addi reads and writes the same register dst.
165   const int dst = inv_rt_field(inst2);
166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
167 
168   // Now, find the preceding addis which writes to dst.
169   int inst1 = 0;
170   address inst1_addr = inst2_addr - BytesPerInstWord;
171   while (inst1_addr >= bound) {
172     inst1 = *(int *) inst1_addr;
173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
174       // stop, found the addis which writes dst
175       break;
176     }
177     inst1_addr -= BytesPerInstWord;
178   }
179 
180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
181 
182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
183   // -1 is a special case
184   if (offset == -1) {
185     return (address)(intptr_t)-1;
186   } else {
187     return global_toc() + offset;
188   }
189 }
190 
191 #ifdef _LP64
192 // Patch compressed oops or klass constants.
193 // Assembler sequence is
194 // 1) compressed oops:
195 //    lis  rx = const.hi
196 //    ori rx = rx | const.lo
197 // 2) compressed klass:
198 //    lis  rx = const.hi
199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
200 //    ori rx = rx | const.lo
201 // Clrldi will be passed by.
patch_set_narrow_oop(address a,address bound,narrowOop data)202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
203   assert(UseCompressedOops, "Should only patch compressed oops");
204 
205   const address inst2_addr = a;
206   const int inst2 = *(int *)inst2_addr;
207 
208   // The relocation points to the second instruction, the ori,
209   // and the ori reads and writes the same register dst.
210   const int dst = inv_rta_field(inst2);
211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
212   // Now, find the preceding addis which writes to dst.
213   int inst1 = 0;
214   address inst1_addr = inst2_addr - BytesPerInstWord;
215   bool inst1_found = false;
216   while (inst1_addr >= bound) {
217     inst1 = *(int *)inst1_addr;
218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
219     inst1_addr -= BytesPerInstWord;
220   }
221   assert(inst1_found, "inst is not lis");
222 
223   int xc = (data >> 16) & 0xffff;
224   int xd = (data >>  0) & 0xffff;
225 
226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
228   return inst1_addr;
229 }
230 
231 // Get compressed oop or klass constant.
get_narrow_oop(address a,address bound)232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
233   assert(UseCompressedOops, "Should only patch compressed oops");
234 
235   const address inst2_addr = a;
236   const int inst2 = *(int *)inst2_addr;
237 
238   // The relocation points to the second instruction, the ori,
239   // and the ori reads and writes the same register dst.
240   const int dst = inv_rta_field(inst2);
241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
242   // Now, find the preceding lis which writes to dst.
243   int inst1 = 0;
244   address inst1_addr = inst2_addr - BytesPerInstWord;
245   bool inst1_found = false;
246 
247   while (inst1_addr >= bound) {
248     inst1 = *(int *) inst1_addr;
249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
250     inst1_addr -= BytesPerInstWord;
251   }
252   assert(inst1_found, "inst is not lis");
253 
254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
256 
257   return (int) (xl | xh);
258 }
259 #endif // _LP64
260 
261 // Returns true if successful.
load_const_from_method_toc(Register dst,AddressLiteral & a,Register toc,bool fixed_size)262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
263                                                 Register toc, bool fixed_size) {
264   int toc_offset = 0;
265   // Use RelocationHolder::none for the constant pool entry, otherwise
266   // we will end up with a failing NativeCall::verify(x) where x is
267   // the address of the constant pool entry.
268   // FIXME: We should insert relocation information for oops at the constant
269   // pool entries instead of inserting it at the loads; patching of a constant
270   // pool entry should be less expensive.
271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
272   if (const_address == NULL) { return false; } // allocation failure
273   // Relocate at the pc of the load.
274   relocate(a.rspec());
275   toc_offset = (int)(const_address - code()->consts()->start());
276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
277   return true;
278 }
279 
is_load_const_from_method_toc_at(address a)280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
281   const address inst1_addr = a;
282   const int inst1 = *(int *)inst1_addr;
283 
284    // The relocation points to the ld or the addis.
285    return (is_ld(inst1)) ||
286           (is_addis(inst1) && inv_ra_field(inst1) != 0);
287 }
288 
get_offset_of_load_const_from_method_toc_at(address a)289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
291 
292   const address inst1_addr = a;
293   const int inst1 = *(int *)inst1_addr;
294 
295   if (is_ld(inst1)) {
296     return inv_d1_field(inst1);
297   } else if (is_addis(inst1)) {
298     const int dst = inv_rt_field(inst1);
299 
300     // Now, find the succeeding ld which reads and writes to dst.
301     address inst2_addr = inst1_addr + BytesPerInstWord;
302     int inst2 = 0;
303     while (true) {
304       inst2 = *(int *) inst2_addr;
305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
306         // Stop, found the ld which reads and writes dst.
307         break;
308       }
309       inst2_addr += BytesPerInstWord;
310     }
311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
312   }
313   ShouldNotReachHere();
314   return 0;
315 }
316 
317 // Get the constant from a `load_const' sequence.
get_const(address a)318 long MacroAssembler::get_const(address a) {
319   assert(is_load_const_at(a), "not a load of a constant");
320   const int *p = (const int*) a;
321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
322   if (is_ori(*(p+1))) {
323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
326   } else if (is_lis(*(p+1))) {
327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
330   } else {
331     ShouldNotReachHere();
332     return (long) 0;
333   }
334   return (long) x;
335 }
336 
337 // Patch the 64 bit constant of a `load_const' sequence. This is a low
338 // level procedure. It neither flushes the instruction cache nor is it
339 // mt safe.
patch_const(address a,long x)340 void MacroAssembler::patch_const(address a, long x) {
341   assert(is_load_const_at(a), "not a load of a constant");
342   int *p = (int*) a;
343   if (is_ori(*(p+1))) {
344     set_imm(0 + p, (x >> 48) & 0xffff);
345     set_imm(1 + p, (x >> 32) & 0xffff);
346     set_imm(3 + p, (x >> 16) & 0xffff);
347     set_imm(4 + p, x & 0xffff);
348   } else if (is_lis(*(p+1))) {
349     set_imm(0 + p, (x >> 48) & 0xffff);
350     set_imm(2 + p, (x >> 32) & 0xffff);
351     set_imm(1 + p, (x >> 16) & 0xffff);
352     set_imm(3 + p, x & 0xffff);
353   } else {
354     ShouldNotReachHere();
355   }
356 }
357 
allocate_metadata_address(Metadata * obj)358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
360   int index = oop_recorder()->allocate_metadata_index(obj);
361   RelocationHolder rspec = metadata_Relocation::spec(index);
362   return AddressLiteral((address)obj, rspec);
363 }
364 
constant_metadata_address(Metadata * obj)365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
367   int index = oop_recorder()->find_index(obj);
368   RelocationHolder rspec = metadata_Relocation::spec(index);
369   return AddressLiteral((address)obj, rspec);
370 }
371 
allocate_oop_address(jobject obj)372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
374   int oop_index = oop_recorder()->allocate_oop_index(obj);
375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
376 }
377 
constant_oop_address(jobject obj)378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
380   int oop_index = oop_recorder()->find_index(obj);
381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
382 }
383 
delayed_value_impl(intptr_t * delayed_value_addr,Register tmp,int offset)384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
385                                                       Register tmp, int offset) {
386   intptr_t value = *delayed_value_addr;
387   if (value != 0) {
388     return RegisterOrConstant(value + offset);
389   }
390 
391   // Load indirectly to solve generation ordering problem.
392   // static address, no relocation
393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
395 
396   if (offset != 0) {
397     addi(tmp, tmp, offset);
398   }
399 
400   return RegisterOrConstant(tmp);
401 }
402 
403 #ifndef PRODUCT
pd_print_patched_instruction(address branch)404 void MacroAssembler::pd_print_patched_instruction(address branch) {
405   Unimplemented(); // TODO: PPC port
406 }
407 #endif // ndef PRODUCT
408 
409 // Conditional far branch for destinations encodable in 24+2 bits.
bc_far(int boint,int biint,Label & dest,int optimize)410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
411 
412   // If requested by flag optimize, relocate the bc_far as a
413   // runtime_call and prepare for optimizing it when the code gets
414   // relocated.
415   if (optimize == bc_far_optimize_on_relocate) {
416     relocate(relocInfo::runtime_call_type);
417   }
418 
419   // variant 2:
420   //
421   //    b!cxx SKIP
422   //    bxx   DEST
423   //  SKIP:
424   //
425 
426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
427                                                 opposite_bcond(inv_boint_bcond(boint)));
428 
429   // We emit two branches.
430   // First, a conditional branch which jumps around the far branch.
431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
432   const address bc_pc        = pc();
433   bc(opposite_boint, biint, not_taken_pc);
434 
435   const int bc_instr = *(int*)bc_pc;
436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
440          "postcondition");
441   assert(biint == inv_bi_field(bc_instr), "postcondition");
442 
443   // Second, an unconditional far branch which jumps to dest.
444   // Note: target(dest) remembers the current pc (see CodeSection::target)
445   //       and returns the current pc if the label is not bound yet; when
446   //       the label gets bound, the unconditional far branch will be patched.
447   const address target_pc = target(dest);
448   const address b_pc  = pc();
449   b(target_pc);
450 
451   assert(not_taken_pc == pc(),                     "postcondition");
452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
453 }
454 
455 // 1 or 2 instructions
bc_far_optimized(int boint,int biint,Label & dest)456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
458     bc(boint, biint, dest);
459   } else {
460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
461   }
462 }
463 
is_bc_far_at(address instruction_addr)464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
465   return is_bc_far_variant1_at(instruction_addr) ||
466          is_bc_far_variant2_at(instruction_addr) ||
467          is_bc_far_variant3_at(instruction_addr);
468 }
469 
get_dest_of_bc_far_at(address instruction_addr)470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
471   if (is_bc_far_variant1_at(instruction_addr)) {
472     const address instruction_1_addr = instruction_addr;
473     const int instruction_1 = *(int*)instruction_1_addr;
474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
475   } else if (is_bc_far_variant2_at(instruction_addr)) {
476     const address instruction_2_addr = instruction_addr + 4;
477     return bxx_destination(instruction_2_addr);
478   } else if (is_bc_far_variant3_at(instruction_addr)) {
479     return instruction_addr + 8;
480   }
481   // variant 4 ???
482   ShouldNotReachHere();
483   return NULL;
484 }
set_dest_of_bc_far_at(address instruction_addr,address dest)485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
486 
487   if (is_bc_far_variant3_at(instruction_addr)) {
488     // variant 3, far cond branch to the next instruction, already patched to nops:
489     //
490     //    nop
491     //    endgroup
492     //  SKIP/DEST:
493     //
494     return;
495   }
496 
497   // first, extract boint and biint from the current branch
498   int boint = 0;
499   int biint = 0;
500 
501   ResourceMark rm;
502   const int code_size = 2 * BytesPerInstWord;
503   CodeBuffer buf(instruction_addr, code_size);
504   MacroAssembler masm(&buf);
505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
507     masm.nop();
508     masm.endgroup();
509   } else {
510     if (is_bc_far_variant1_at(instruction_addr)) {
511       // variant 1, the 1st instruction contains the destination address:
512       //
513       //    bcxx  DEST
514       //    nop
515       //
516       const int instruction_1 = *(int*)(instruction_addr);
517       boint = inv_bo_field(instruction_1);
518       biint = inv_bi_field(instruction_1);
519     } else if (is_bc_far_variant2_at(instruction_addr)) {
520       // variant 2, the 2nd instruction contains the destination address:
521       //
522       //    b!cxx SKIP
523       //    bxx   DEST
524       //  SKIP:
525       //
526       const int instruction_1 = *(int*)(instruction_addr);
527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
529       biint = inv_bi_field(instruction_1);
530     } else {
531       // variant 4???
532       ShouldNotReachHere();
533     }
534 
535     // second, set the new branch destination and optimize the code
536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
538       // variant 1:
539       //
540       //    bcxx  DEST
541       //    nop
542       //
543       masm.bc(boint, biint, dest);
544       masm.nop();
545     } else {
546       // variant 2:
547       //
548       //    b!cxx SKIP
549       //    bxx   DEST
550       //  SKIP:
551       //
552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
553                                                     opposite_bcond(inv_boint_bcond(boint)));
554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
555       masm.bc(opposite_boint, biint, not_taken_pc);
556       masm.b(dest);
557     }
558   }
559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
560 }
561 
562 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
bxx64_patchable(address dest,relocInfo::relocType rt,bool link)563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
564   // get current pc
565   uint64_t start_pc = (uint64_t) pc();
566 
567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
569 
570   // relocate here
571   if (rt != relocInfo::none) {
572     relocate(rt);
573   }
574 
575   if ( ReoptimizeCallSequences &&
576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
578     // variant 2:
579     // Emit an optimized, pc-relative call/jump.
580 
581     if (link) {
582       // some padding
583       nop();
584       nop();
585       nop();
586       nop();
587       nop();
588       nop();
589 
590       // do the call
591       assert(pc() == pc_of_bl, "just checking");
592       bl(dest, relocInfo::none);
593     } else {
594       // do the jump
595       assert(pc() == pc_of_b, "just checking");
596       b(dest, relocInfo::none);
597 
598       // some padding
599       nop();
600       nop();
601       nop();
602       nop();
603       nop();
604       nop();
605     }
606 
607     // Assert that we can identify the emitted call/jump.
608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
609            "can't identify emitted call");
610   } else {
611     // variant 1:
612     mr(R0, R11);  // spill R11 -> R0.
613 
614     // Load the destination address into CTR,
615     // calculate destination relative to global toc.
616     calculate_address_from_global_toc(R11, dest, true, true, false);
617 
618     mtctr(R11);
619     mr(R11, R0);  // spill R11 <- R0.
620     nop();
621 
622     // do the call/jump
623     if (link) {
624       bctrl();
625     } else{
626       bctr();
627     }
628     // Assert that we can identify the emitted call/jump.
629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
630            "can't identify emitted call");
631   }
632 
633   // Assert that we can identify the emitted call/jump.
634   assert(is_bxx64_patchable_at((address)start_pc, link),
635          "can't identify emitted call");
636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
637          "wrong encoding of dest address");
638 }
639 
640 // Identify a bxx64_patchable instruction.
is_bxx64_patchable_at(address instruction_addr,bool link)641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
645 }
646 
647 // Does the call64_patchable instruction use a pc-relative encoding of
648 // the call destination?
is_bxx64_patchable_pcrelative_at(address instruction_addr,bool link)649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
650   // variant 2 is pc-relative
651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
652 }
653 
654 // Identify variant 1.
is_bxx64_patchable_variant1_at(address instruction_addr,bool link)655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
656   unsigned int* instr = (unsigned int*) instruction_addr;
657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
658       && is_mtctr(instr[5]) // mtctr
659     && is_load_const_at(instruction_addr);
660 }
661 
662 // Identify variant 1b: load destination relative to global toc.
is_bxx64_patchable_variant1b_at(address instruction_addr,bool link)663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
664   unsigned int* instr = (unsigned int*) instruction_addr;
665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
666     && is_mtctr(instr[3]) // mtctr
667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
668 }
669 
670 // Identify variant 2.
is_bxx64_patchable_variant2_at(address instruction_addr,bool link)671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
672   unsigned int* instr = (unsigned int*) instruction_addr;
673   if (link) {
674     return is_bl (instr[6])  // bl dest is last
675       && is_nop(instr[0])  // nop
676       && is_nop(instr[1])  // nop
677       && is_nop(instr[2])  // nop
678       && is_nop(instr[3])  // nop
679       && is_nop(instr[4])  // nop
680       && is_nop(instr[5]); // nop
681   } else {
682     return is_b  (instr[0])  // b  dest is first
683       && is_nop(instr[1])  // nop
684       && is_nop(instr[2])  // nop
685       && is_nop(instr[3])  // nop
686       && is_nop(instr[4])  // nop
687       && is_nop(instr[5])  // nop
688       && is_nop(instr[6]); // nop
689   }
690 }
691 
692 // Set dest address of a bxx64_patchable instruction.
set_dest_of_bxx64_patchable_at(address instruction_addr,address dest,bool link)693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
694   ResourceMark rm;
695   int code_size = MacroAssembler::bxx64_patchable_size;
696   CodeBuffer buf(instruction_addr, code_size);
697   MacroAssembler masm(&buf);
698   masm.bxx64_patchable(dest, relocInfo::none, link);
699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
700 }
701 
702 // Get dest address of a bxx64_patchable instruction.
get_dest_of_bxx64_patchable_at(address instruction_addr,bool link)703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
705     return (address) (unsigned long) get_const(instruction_addr);
706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
707     unsigned int* instr = (unsigned int*) instruction_addr;
708     if (link) {
709       const int instr_idx = 6; // bl is last
710       int branchoffset = branch_destination(instr[instr_idx], 0);
711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
712     } else {
713       const int instr_idx = 0; // b is first
714       int branchoffset = branch_destination(instr[instr_idx], 0);
715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
716     }
717   // Load dest relative to global toc.
718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
720                                                                instruction_addr);
721   } else {
722     ShouldNotReachHere();
723     return NULL;
724   }
725 }
726 
727 // Uses ordering which corresponds to ABI:
728 //    _savegpr0_14:  std  r14,-144(r1)
729 //    _savegpr0_15:  std  r15,-136(r1)
730 //    _savegpr0_16:  std  r16,-128(r1)
save_nonvolatile_gprs(Register dst,int offset)731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
732   std(R14, offset, dst);   offset += 8;
733   std(R15, offset, dst);   offset += 8;
734   std(R16, offset, dst);   offset += 8;
735   std(R17, offset, dst);   offset += 8;
736   std(R18, offset, dst);   offset += 8;
737   std(R19, offset, dst);   offset += 8;
738   std(R20, offset, dst);   offset += 8;
739   std(R21, offset, dst);   offset += 8;
740   std(R22, offset, dst);   offset += 8;
741   std(R23, offset, dst);   offset += 8;
742   std(R24, offset, dst);   offset += 8;
743   std(R25, offset, dst);   offset += 8;
744   std(R26, offset, dst);   offset += 8;
745   std(R27, offset, dst);   offset += 8;
746   std(R28, offset, dst);   offset += 8;
747   std(R29, offset, dst);   offset += 8;
748   std(R30, offset, dst);   offset += 8;
749   std(R31, offset, dst);   offset += 8;
750 
751   stfd(F14, offset, dst);   offset += 8;
752   stfd(F15, offset, dst);   offset += 8;
753   stfd(F16, offset, dst);   offset += 8;
754   stfd(F17, offset, dst);   offset += 8;
755   stfd(F18, offset, dst);   offset += 8;
756   stfd(F19, offset, dst);   offset += 8;
757   stfd(F20, offset, dst);   offset += 8;
758   stfd(F21, offset, dst);   offset += 8;
759   stfd(F22, offset, dst);   offset += 8;
760   stfd(F23, offset, dst);   offset += 8;
761   stfd(F24, offset, dst);   offset += 8;
762   stfd(F25, offset, dst);   offset += 8;
763   stfd(F26, offset, dst);   offset += 8;
764   stfd(F27, offset, dst);   offset += 8;
765   stfd(F28, offset, dst);   offset += 8;
766   stfd(F29, offset, dst);   offset += 8;
767   stfd(F30, offset, dst);   offset += 8;
768   stfd(F31, offset, dst);
769 }
770 
771 // Uses ordering which corresponds to ABI:
772 //    _restgpr0_14:  ld   r14,-144(r1)
773 //    _restgpr0_15:  ld   r15,-136(r1)
774 //    _restgpr0_16:  ld   r16,-128(r1)
restore_nonvolatile_gprs(Register src,int offset)775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
776   ld(R14, offset, src);   offset += 8;
777   ld(R15, offset, src);   offset += 8;
778   ld(R16, offset, src);   offset += 8;
779   ld(R17, offset, src);   offset += 8;
780   ld(R18, offset, src);   offset += 8;
781   ld(R19, offset, src);   offset += 8;
782   ld(R20, offset, src);   offset += 8;
783   ld(R21, offset, src);   offset += 8;
784   ld(R22, offset, src);   offset += 8;
785   ld(R23, offset, src);   offset += 8;
786   ld(R24, offset, src);   offset += 8;
787   ld(R25, offset, src);   offset += 8;
788   ld(R26, offset, src);   offset += 8;
789   ld(R27, offset, src);   offset += 8;
790   ld(R28, offset, src);   offset += 8;
791   ld(R29, offset, src);   offset += 8;
792   ld(R30, offset, src);   offset += 8;
793   ld(R31, offset, src);   offset += 8;
794 
795   // FP registers
796   lfd(F14, offset, src);   offset += 8;
797   lfd(F15, offset, src);   offset += 8;
798   lfd(F16, offset, src);   offset += 8;
799   lfd(F17, offset, src);   offset += 8;
800   lfd(F18, offset, src);   offset += 8;
801   lfd(F19, offset, src);   offset += 8;
802   lfd(F20, offset, src);   offset += 8;
803   lfd(F21, offset, src);   offset += 8;
804   lfd(F22, offset, src);   offset += 8;
805   lfd(F23, offset, src);   offset += 8;
806   lfd(F24, offset, src);   offset += 8;
807   lfd(F25, offset, src);   offset += 8;
808   lfd(F26, offset, src);   offset += 8;
809   lfd(F27, offset, src);   offset += 8;
810   lfd(F28, offset, src);   offset += 8;
811   lfd(F29, offset, src);   offset += 8;
812   lfd(F30, offset, src);   offset += 8;
813   lfd(F31, offset, src);
814 }
815 
816 // For verify_oops.
save_volatile_gprs(Register dst,int offset)817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
818   std(R2,  offset, dst);   offset += 8;
819   std(R3,  offset, dst);   offset += 8;
820   std(R4,  offset, dst);   offset += 8;
821   std(R5,  offset, dst);   offset += 8;
822   std(R6,  offset, dst);   offset += 8;
823   std(R7,  offset, dst);   offset += 8;
824   std(R8,  offset, dst);   offset += 8;
825   std(R9,  offset, dst);   offset += 8;
826   std(R10, offset, dst);   offset += 8;
827   std(R11, offset, dst);   offset += 8;
828   std(R12, offset, dst);   offset += 8;
829 
830   stfd(F0, offset, dst);   offset += 8;
831   stfd(F1, offset, dst);   offset += 8;
832   stfd(F2, offset, dst);   offset += 8;
833   stfd(F3, offset, dst);   offset += 8;
834   stfd(F4, offset, dst);   offset += 8;
835   stfd(F5, offset, dst);   offset += 8;
836   stfd(F6, offset, dst);   offset += 8;
837   stfd(F7, offset, dst);   offset += 8;
838   stfd(F8, offset, dst);   offset += 8;
839   stfd(F9, offset, dst);   offset += 8;
840   stfd(F10, offset, dst);  offset += 8;
841   stfd(F11, offset, dst);  offset += 8;
842   stfd(F12, offset, dst);  offset += 8;
843   stfd(F13, offset, dst);
844 }
845 
846 // For verify_oops.
restore_volatile_gprs(Register src,int offset)847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
848   ld(R2,  offset, src);   offset += 8;
849   ld(R3,  offset, src);   offset += 8;
850   ld(R4,  offset, src);   offset += 8;
851   ld(R5,  offset, src);   offset += 8;
852   ld(R6,  offset, src);   offset += 8;
853   ld(R7,  offset, src);   offset += 8;
854   ld(R8,  offset, src);   offset += 8;
855   ld(R9,  offset, src);   offset += 8;
856   ld(R10, offset, src);   offset += 8;
857   ld(R11, offset, src);   offset += 8;
858   ld(R12, offset, src);   offset += 8;
859 
860   lfd(F0, offset, src);   offset += 8;
861   lfd(F1, offset, src);   offset += 8;
862   lfd(F2, offset, src);   offset += 8;
863   lfd(F3, offset, src);   offset += 8;
864   lfd(F4, offset, src);   offset += 8;
865   lfd(F5, offset, src);   offset += 8;
866   lfd(F6, offset, src);   offset += 8;
867   lfd(F7, offset, src);   offset += 8;
868   lfd(F8, offset, src);   offset += 8;
869   lfd(F9, offset, src);   offset += 8;
870   lfd(F10, offset, src);  offset += 8;
871   lfd(F11, offset, src);  offset += 8;
872   lfd(F12, offset, src);  offset += 8;
873   lfd(F13, offset, src);
874 }
875 
save_LR_CR(Register tmp)876 void MacroAssembler::save_LR_CR(Register tmp) {
877   mfcr(tmp);
878   std(tmp, _abi(cr), R1_SP);
879   mflr(tmp);
880   std(tmp, _abi(lr), R1_SP);
881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
882 }
883 
restore_LR_CR(Register tmp)884 void MacroAssembler::restore_LR_CR(Register tmp) {
885   assert(tmp != R1_SP, "must be distinct");
886   ld(tmp, _abi(lr), R1_SP);
887   mtlr(tmp);
888   ld(tmp, _abi(cr), R1_SP);
889   mtcr(tmp);
890 }
891 
get_PC_trash_LR(Register result)892 address MacroAssembler::get_PC_trash_LR(Register result) {
893   Label L;
894   bl(L);
895   bind(L);
896   address lr_pc = pc();
897   mflr(result);
898   return lr_pc;
899 }
900 
resize_frame(Register offset,Register tmp)901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
902 #ifdef ASSERT
903   assert_different_registers(offset, tmp, R1_SP);
904   andi_(tmp, offset, frame::alignment_in_bytes-1);
905   asm_assert_eq("resize_frame: unaligned");
906 #endif
907 
908   // tmp <- *(SP)
909   ld(tmp, _abi(callers_sp), R1_SP);
910   // addr <- SP + offset;
911   // *(addr) <- tmp;
912   // SP <- addr
913   stdux(tmp, R1_SP, offset);
914 }
915 
resize_frame(int offset,Register tmp)916 void MacroAssembler::resize_frame(int offset, Register tmp) {
917   assert(is_simm(offset, 16), "too big an offset");
918   assert_different_registers(tmp, R1_SP);
919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
920   // tmp <- *(SP)
921   ld(tmp, _abi(callers_sp), R1_SP);
922   // addr <- SP + offset;
923   // *(addr) <- tmp;
924   // SP <- addr
925   stdu(tmp, offset, R1_SP);
926 }
927 
resize_frame_absolute(Register addr,Register tmp1,Register tmp2)928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
929   // (addr == tmp1) || (addr == tmp2) is allowed here!
930   assert(tmp1 != tmp2, "must be distinct");
931 
932   // compute offset w.r.t. current stack pointer
933   // tmp_1 <- addr - SP (!)
934   subf(tmp1, R1_SP, addr);
935 
936   // atomically update SP keeping back link.
937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
938 }
939 
push_frame(Register bytes,Register tmp)940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
941 #ifdef ASSERT
942   assert(bytes != R0, "r0 not allowed here");
943   andi_(R0, bytes, frame::alignment_in_bytes-1);
944   asm_assert_eq("push_frame(Reg, Reg): unaligned");
945 #endif
946   neg(tmp, bytes);
947   stdux(R1_SP, R1_SP, tmp);
948 }
949 
950 // Push a frame of size `bytes'.
push_frame(unsigned int bytes,Register tmp)951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
952   long offset = align_addr(bytes, frame::alignment_in_bytes);
953   if (is_simm(-offset, 16)) {
954     stdu(R1_SP, -offset, R1_SP);
955   } else {
956     load_const_optimized(tmp, -offset);
957     stdux(R1_SP, R1_SP, tmp);
958   }
959 }
960 
961 // Push a frame of size `bytes' plus abi_reg_args on top.
push_frame_reg_args(unsigned int bytes,Register tmp)962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
963   push_frame(bytes + frame::abi_reg_args_size, tmp);
964 }
965 
966 // Setup up a new C frame with a spill area for non-volatile GPRs and
967 // additional space for local variables.
push_frame_reg_args_nonvolatiles(unsigned int bytes,Register tmp)968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
969                                                       Register tmp) {
970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
971 }
972 
973 // Pop current C frame.
pop_frame()974 void MacroAssembler::pop_frame() {
975   ld(R1_SP, _abi(callers_sp), R1_SP);
976 }
977 
978 #if defined(ABI_ELFv2)
branch_to(Register r_function_entry,bool and_link)979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
980   // TODO(asmundak): make sure the caller uses R12 as function descriptor
981   // most of the times.
982   if (R12 != r_function_entry) {
983     mr(R12, r_function_entry);
984   }
985   mtctr(R12);
986   // Do a call or a branch.
987   if (and_link) {
988     bctrl();
989   } else {
990     bctr();
991   }
992   _last_calls_return_pc = pc();
993 
994   return _last_calls_return_pc;
995 }
996 
997 // Call a C function via a function descriptor and use full C
998 // calling conventions. Updates and returns _last_calls_return_pc.
call_c(Register r_function_entry)999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
call_c_and_return_to_caller(Register r_function_entry)1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
call_c(address function_entry,relocInfo::relocType rt)1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
1014 // Generic version of a call to C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
branch_to(Register function_descriptor,bool and_link,bool save_toc_before_call,bool restore_toc_after_call,bool load_toc_of_callee,bool load_env_of_callee)1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1019   // we emit standard ptrgl glue code here
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
call_c(Register fd)1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
call_c_and_return_to_caller(Register fd)1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
call_c(const FunctionDescriptor * fd,relocInfo::relocType rt)1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
1096     // This call does not need to be relocatable, do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
1109       // it's a friend function, load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
call_c_using_toc(const FunctionDescriptor * fd,relocInfo::relocType rt,Register toc)1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
call_VM_base(Register oop_result,Register last_java_sp,address entry_point,bool check_exceptions)1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
1188     // We don't check for exceptions here.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
call_VM_leaf_base(address entry_point)1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
call_VM(Register oop_result,address entry_point,bool check_exceptions)1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
call_VM(Register oop_result,address entry_point,Register arg_1,bool check_exceptions)1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
call_VM(Register oop_result,address entry_point,Register arg_1,Register arg_2,bool check_exceptions)1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
call_VM(Register oop_result,address entry_point,Register arg_1,Register arg_2,Register arg_3,bool check_exceptions)1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
call_VM_leaf(address entry_point)1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
call_VM_leaf(address entry_point,Register arg_1)1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
call_VM_leaf(address entry_point,Register arg_1,Register arg_2)1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
call_VM_leaf(address entry_point,Register arg_1,Register arg_2,Register arg_3)1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
is_load_from_polling_page(int instruction,void * ucontext,address * polling_address_ptr)1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
1289   // Ucontext given. Check that register ra contains the address of
1290   // the safepoing polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293 #if defined(LINUX)
1294   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1295 #elif defined(_ALLBSD_SOURCE)
1296   address addr = (address)uc->uc_mcontext.mc_gpr[ra] + (ssize_t)ds;
1297 #endif
1298   if (polling_address_ptr != NULL) {
1299     *polling_address_ptr = addr;
1300   }
1301   return SafepointMechanism::is_poll_address(addr);
1302 #else
1303   // Not on Linux, ucontext must be NULL.
1304   ShouldNotReachHere();
1305   return false;
1306 #endif
1307 }
1308 
bang_stack_with_offset(int offset)1309 void MacroAssembler::bang_stack_with_offset(int offset) {
1310   // When increasing the stack, the old stack pointer will be written
1311   // to the new top of stack according to the PPC64 abi.
1312   // Therefore, stack banging is not necessary when increasing
1313   // the stack by <= os::vm_page_size() bytes.
1314   // When increasing the stack by a larger amount, this method is
1315   // called repeatedly to bang the intermediate pages.
1316 
1317   // Stack grows down, caller passes positive offset.
1318   assert(offset > 0, "must bang with positive offset");
1319 
1320   long stdoffset = -offset;
1321 
1322   if (is_simm(stdoffset, 16)) {
1323     // Signed 16 bit offset, a simple std is ok.
1324     if (UseLoadInstructionsForStackBangingPPC64) {
1325       ld(R0, (int)(signed short)stdoffset, R1_SP);
1326     } else {
1327       std(R0,(int)(signed short)stdoffset, R1_SP);
1328     }
1329   } else if (is_simm(stdoffset, 31)) {
1330     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1331     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1332 
1333     Register tmp = R11;
1334     addis(tmp, R1_SP, hi);
1335     if (UseLoadInstructionsForStackBangingPPC64) {
1336       ld(R0,  lo, tmp);
1337     } else {
1338       std(R0, lo, tmp);
1339     }
1340   } else {
1341     ShouldNotReachHere();
1342   }
1343 }
1344 
1345 // If instruction is a stack bang of the form
1346 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1347 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1348 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1349 // return the banged address. Otherwise, return 0.
get_stack_bang_address(int instruction,void * ucontext)1350 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1351 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
1352   ucontext_t* uc = (ucontext_t*) ucontext;
1353   int rs = inv_rs_field(instruction);
1354   int ra = inv_ra_field(instruction);
1355   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1356       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1357       || (is_stdu(instruction) && rs == 1)) {
1358     int ds = inv_ds_field(instruction);
1359     // return banged address
1360 #if defined(LINUX)
1361     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1362 #elif defined(_ALLBSD_SOURCE)
1363     return ds+(address)uc->uc_mcontext.mc_gpr[ra];
1364 #endif
1365   } else if (is_stdux(instruction) && rs == 1) {
1366     int rb = inv_rb_field(instruction);
1367 #if defined(LINUX)
1368     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1369     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1370 #elif defined(_ALLBSD_SOURCE)
1371     address sp = (address)uc->uc_mcontext.mc_gpr[1];
1372     long rb_val = (long)uc->uc_mcontext.mc_gpr[rb];
1373 #endif
1374     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1375                                   : sp + rb_val; // banged address
1376   }
1377   return NULL; // not a stack bang
1378 #else
1379   // workaround not needed on !LINUX :-)
1380   ShouldNotCallThis();
1381   return NULL;
1382 #endif
1383 }
1384 
reserved_stack_check(Register return_pc)1385 void MacroAssembler::reserved_stack_check(Register return_pc) {
1386   // Test if reserved zone needs to be enabled.
1387   Label no_reserved_zone_enabling;
1388 
1389   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1390   cmpld(CCR0, R1_SP, R0);
1391   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1392 
1393   // Enable reserved zone again, throw stack overflow exception.
1394   push_frame_reg_args(0, R0);
1395   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1396   pop_frame();
1397   mtlr(return_pc);
1398   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1399   mtctr(R0);
1400   bctr();
1401 
1402   should_not_reach_here();
1403 
1404   bind(no_reserved_zone_enabling);
1405 }
1406 
getandsetd(Register dest_current_value,Register exchange_value,Register addr_base,bool cmpxchgx_hint)1407 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1408                                 bool cmpxchgx_hint) {
1409   Label retry;
1410   bind(retry);
1411   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1412   stdcx_(exchange_value, addr_base);
1413   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1414     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1415   } else {
1416     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1417   }
1418 }
1419 
getandaddd(Register dest_current_value,Register inc_value,Register addr_base,Register tmp,bool cmpxchgx_hint)1420 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1421                                 Register tmp, bool cmpxchgx_hint) {
1422   Label retry;
1423   bind(retry);
1424   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1425   add(tmp, dest_current_value, inc_value);
1426   stdcx_(tmp, addr_base);
1427   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1428     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1429   } else {
1430     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1431   }
1432 }
1433 
1434 // Word/sub-word atomic helper functions
1435 
1436 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1437 // Only signed types are supported with size < 4.
1438 // Atomic add always kills tmp1.
atomic_get_and_modify_generic(Register dest_current_value,Register exchange_value,Register addr_base,Register tmp1,Register tmp2,Register tmp3,bool cmpxchgx_hint,bool is_add,int size)1439 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1440                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1441                                                    bool cmpxchgx_hint, bool is_add, int size) {
1442   // Sub-word instructions are available since Power 8.
1443   // For older processors, instruction_type != size holds, and we
1444   // emulate the sub-word instructions by constructing a 4-byte value
1445   // that leaves the other bytes unchanged.
1446   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1447 
1448   Label retry;
1449   Register shift_amount = noreg,
1450            val32 = dest_current_value,
1451            modval = is_add ? tmp1 : exchange_value;
1452 
1453   if (instruction_type != size) {
1454     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1455     modval = tmp1;
1456     shift_amount = tmp2;
1457     val32 = tmp3;
1458     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1459 #ifdef VM_LITTLE_ENDIAN
1460     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1461     clrrdi(addr_base, addr_base, 2);
1462 #else
1463     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1464     clrrdi(addr_base, addr_base, 2);
1465     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1466 #endif
1467   }
1468 
1469   // atomic emulation loop
1470   bind(retry);
1471 
1472   switch (instruction_type) {
1473     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1474     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1475     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1476     default: ShouldNotReachHere();
1477   }
1478 
1479   if (instruction_type != size) {
1480     srw(dest_current_value, val32, shift_amount);
1481   }
1482 
1483   if (is_add) { add(modval, dest_current_value, exchange_value); }
1484 
1485   if (instruction_type != size) {
1486     // Transform exchange value such that the replacement can be done by one xor instruction.
1487     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1488     clrldi(modval, modval, (size == 1) ? 56 : 48);
1489     slw(modval, modval, shift_amount);
1490     xorr(modval, val32, modval);
1491   }
1492 
1493   switch (instruction_type) {
1494     case 4: stwcx_(modval, addr_base); break;
1495     case 2: sthcx_(modval, addr_base); break;
1496     case 1: stbcx_(modval, addr_base); break;
1497     default: ShouldNotReachHere();
1498   }
1499 
1500   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1501     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1502   } else {
1503     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1504   }
1505 
1506   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1507   if (size == 1) {
1508     extsb(dest_current_value, dest_current_value);
1509   } else if (size == 2) {
1510     extsh(dest_current_value, dest_current_value);
1511   };
1512 }
1513 
1514 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1515 // Only signed types are supported with size < 4.
cmpxchg_loop_body(ConditionRegister flag,Register dest_current_value,Register compare_value,Register exchange_value,Register addr_base,Register tmp1,Register tmp2,Label & retry,Label & failed,bool cmpxchgx_hint,int size)1516 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1517                                        Register compare_value, Register exchange_value,
1518                                        Register addr_base, Register tmp1, Register tmp2,
1519                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1520   // Sub-word instructions are available since Power 8.
1521   // For older processors, instruction_type != size holds, and we
1522   // emulate the sub-word instructions by constructing a 4-byte value
1523   // that leaves the other bytes unchanged.
1524   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1525 
1526   Register shift_amount = noreg,
1527            val32 = dest_current_value,
1528            modval = exchange_value;
1529 
1530   if (instruction_type != size) {
1531     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1532     shift_amount = tmp1;
1533     val32 = tmp2;
1534     modval = tmp2;
1535     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1536 #ifdef VM_LITTLE_ENDIAN
1537     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1538     clrrdi(addr_base, addr_base, 2);
1539 #else
1540     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1541     clrrdi(addr_base, addr_base, 2);
1542     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1543 #endif
1544     // Transform exchange value such that the replacement can be done by one xor instruction.
1545     xorr(exchange_value, compare_value, exchange_value);
1546     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1547     slw(exchange_value, exchange_value, shift_amount);
1548   }
1549 
1550   // atomic emulation loop
1551   bind(retry);
1552 
1553   switch (instruction_type) {
1554     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1555     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1556     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1557     default: ShouldNotReachHere();
1558   }
1559 
1560   if (instruction_type != size) {
1561     srw(dest_current_value, val32, shift_amount);
1562   }
1563   if (size == 1) {
1564     extsb(dest_current_value, dest_current_value);
1565   } else if (size == 2) {
1566     extsh(dest_current_value, dest_current_value);
1567   };
1568 
1569   cmpw(flag, dest_current_value, compare_value);
1570   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1571     bne_predict_not_taken(flag, failed);
1572   } else {
1573     bne(                  flag, failed);
1574   }
1575   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1576   // fall through    => (flag == eq), (dest_current_value == compare_value)
1577 
1578   if (instruction_type != size) {
1579     xorr(modval, val32, exchange_value);
1580   }
1581 
1582   switch (instruction_type) {
1583     case 4: stwcx_(modval, addr_base); break;
1584     case 2: sthcx_(modval, addr_base); break;
1585     case 1: stbcx_(modval, addr_base); break;
1586     default: ShouldNotReachHere();
1587   }
1588 }
1589 
1590 // CmpxchgX sets condition register to cmpX(current, compare).
cmpxchg_generic(ConditionRegister flag,Register dest_current_value,Register compare_value,Register exchange_value,Register addr_base,Register tmp1,Register tmp2,int semantics,bool cmpxchgx_hint,Register int_flag_success,bool contention_hint,bool weak,int size)1591 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1592                                      Register compare_value, Register exchange_value,
1593                                      Register addr_base, Register tmp1, Register tmp2,
1594                                      int semantics, bool cmpxchgx_hint,
1595                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1596   Label retry;
1597   Label failed;
1598   Label done;
1599 
1600   // Save one branch if result is returned via register and
1601   // result register is different from the other ones.
1602   bool use_result_reg    = (int_flag_success != noreg);
1603   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1604                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1605                             int_flag_success != tmp1 && int_flag_success != tmp2);
1606   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1607   assert(size == 1 || size == 2 || size == 4, "unsupported");
1608 
1609   if (use_result_reg && preset_result_reg) {
1610     li(int_flag_success, 0); // preset (assume cas failed)
1611   }
1612 
1613   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1614   if (contention_hint) { // Don't try to reserve if cmp fails.
1615     switch (size) {
1616       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1617       case 2: lha(dest_current_value, 0, addr_base); break;
1618       case 4: lwz(dest_current_value, 0, addr_base); break;
1619       default: ShouldNotReachHere();
1620     }
1621     cmpw(flag, dest_current_value, compare_value);
1622     bne(flag, failed);
1623   }
1624 
1625   // release/fence semantics
1626   if (semantics & MemBarRel) {
1627     release();
1628   }
1629 
1630   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1631                     retry, failed, cmpxchgx_hint, size);
1632   if (!weak || use_result_reg) {
1633     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1634       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1635     } else {
1636       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1637     }
1638   }
1639   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1640 
1641   // Result in register (must do this at the end because int_flag_success can be the
1642   // same register as one above).
1643   if (use_result_reg) {
1644     li(int_flag_success, 1);
1645   }
1646 
1647   if (semantics & MemBarFenceAfter) {
1648     fence();
1649   } else if (semantics & MemBarAcq) {
1650     isync();
1651   }
1652 
1653   if (use_result_reg && !preset_result_reg) {
1654     b(done);
1655   }
1656 
1657   bind(failed);
1658   if (use_result_reg && !preset_result_reg) {
1659     li(int_flag_success, 0);
1660   }
1661 
1662   bind(done);
1663   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1664   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1665 }
1666 
1667 // Preforms atomic compare exchange:
1668 //   if (compare_value == *addr_base)
1669 //     *addr_base = exchange_value
1670 //     int_flag_success = 1;
1671 //   else
1672 //     int_flag_success = 0;
1673 //
1674 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1675 // Register dest_current_value  = *addr_base
1676 // Register compare_value       Used to compare with value in memory
1677 // Register exchange_value      Written to memory if compare_value == *addr_base
1678 // Register addr_base           The memory location to compareXChange
1679 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1680 //
1681 // To avoid the costly compare exchange the value is tested beforehand.
1682 // Several special cases exist to avoid that unnecessary information is generated.
1683 //
cmpxchgd(ConditionRegister flag,Register dest_current_value,RegisterOrConstant compare_value,Register exchange_value,Register addr_base,int semantics,bool cmpxchgx_hint,Register int_flag_success,Label * failed_ext,bool contention_hint,bool weak)1684 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1685                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1686                               Register addr_base, int semantics, bool cmpxchgx_hint,
1687                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1688   Label retry;
1689   Label failed_int;
1690   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1691   Label done;
1692 
1693   // Save one branch if result is returned via register and result register is different from the other ones.
1694   bool use_result_reg    = (int_flag_success!=noreg);
1695   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1696                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1697   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1698   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1699 
1700   if (use_result_reg && preset_result_reg) {
1701     li(int_flag_success, 0); // preset (assume cas failed)
1702   }
1703 
1704   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1705   if (contention_hint) { // Don't try to reserve if cmp fails.
1706     ld(dest_current_value, 0, addr_base);
1707     cmpd(flag, compare_value, dest_current_value);
1708     bne(flag, failed);
1709   }
1710 
1711   // release/fence semantics
1712   if (semantics & MemBarRel) {
1713     release();
1714   }
1715 
1716   // atomic emulation loop
1717   bind(retry);
1718 
1719   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1720   cmpd(flag, compare_value, dest_current_value);
1721   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1722     bne_predict_not_taken(flag, failed);
1723   } else {
1724     bne(                  flag, failed);
1725   }
1726 
1727   stdcx_(exchange_value, addr_base);
1728   if (!weak || use_result_reg || failed_ext) {
1729     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1730       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1731     } else {
1732       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1733     }
1734   }
1735 
1736   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1737   if (use_result_reg) {
1738     li(int_flag_success, 1);
1739   }
1740 
1741   if (semantics & MemBarFenceAfter) {
1742     fence();
1743   } else if (semantics & MemBarAcq) {
1744     isync();
1745   }
1746 
1747   if (use_result_reg && !preset_result_reg) {
1748     b(done);
1749   }
1750 
1751   bind(failed_int);
1752   if (use_result_reg && !preset_result_reg) {
1753     li(int_flag_success, 0);
1754   }
1755 
1756   bind(done);
1757   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1758   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1759 }
1760 
1761 // Look up the method for a megamorphic invokeinterface call.
1762 // The target method is determined by <intf_klass, itable_index>.
1763 // The receiver klass is in recv_klass.
1764 // On success, the result will be in method_result, and execution falls through.
1765 // On failure, execution transfers to the given label.
lookup_interface_method(Register recv_klass,Register intf_klass,RegisterOrConstant itable_index,Register method_result,Register scan_temp,Register temp2,Label & L_no_such_interface,bool return_method)1766 void MacroAssembler::lookup_interface_method(Register recv_klass,
1767                                              Register intf_klass,
1768                                              RegisterOrConstant itable_index,
1769                                              Register method_result,
1770                                              Register scan_temp,
1771                                              Register temp2,
1772                                              Label& L_no_such_interface,
1773                                              bool return_method) {
1774   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1775 
1776   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1777   int vtable_base = in_bytes(Klass::vtable_start_offset());
1778   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1779   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1780   int scan_step   = itableOffsetEntry::size() * wordSize;
1781   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1782 
1783   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1784   // %%% We should store the aligned, prescaled offset in the klassoop.
1785   // Then the next several instructions would fold away.
1786 
1787   sldi(scan_temp, scan_temp, log_vte_size);
1788   addi(scan_temp, scan_temp, vtable_base);
1789   add(scan_temp, recv_klass, scan_temp);
1790 
1791   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1792   if (return_method) {
1793     if (itable_index.is_register()) {
1794       Register itable_offset = itable_index.as_register();
1795       sldi(method_result, itable_offset, logMEsize);
1796       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1797       add(method_result, method_result, recv_klass);
1798     } else {
1799       long itable_offset = (long)itable_index.as_constant();
1800       // static address, no relocation
1801       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1802     }
1803   }
1804 
1805   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1806   //   if (scan->interface() == intf) {
1807   //     result = (klass + scan->offset() + itable_index);
1808   //   }
1809   // }
1810   Label search, found_method;
1811 
1812   for (int peel = 1; peel >= 0; peel--) {
1813     // %%%% Could load both offset and interface in one ldx, if they were
1814     // in the opposite order. This would save a load.
1815     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1816 
1817     // Check that this entry is non-null. A null entry means that
1818     // the receiver class doesn't implement the interface, and wasn't the
1819     // same as when the caller was compiled.
1820     cmpd(CCR0, temp2, intf_klass);
1821 
1822     if (peel) {
1823       beq(CCR0, found_method);
1824     } else {
1825       bne(CCR0, search);
1826       // (invert the test to fall through to found_method...)
1827     }
1828 
1829     if (!peel) break;
1830 
1831     bind(search);
1832 
1833     cmpdi(CCR0, temp2, 0);
1834     beq(CCR0, L_no_such_interface);
1835     addi(scan_temp, scan_temp, scan_step);
1836   }
1837 
1838   bind(found_method);
1839 
1840   // Got a hit.
1841   if (return_method) {
1842     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1843     lwz(scan_temp, ito_offset, scan_temp);
1844     ldx(method_result, scan_temp, method_result);
1845   }
1846 }
1847 
1848 // virtual method calling
lookup_virtual_method(Register recv_klass,RegisterOrConstant vtable_index,Register method_result)1849 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1850                                            RegisterOrConstant vtable_index,
1851                                            Register method_result) {
1852 
1853   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1854 
1855   const int base = in_bytes(Klass::vtable_start_offset());
1856   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1857 
1858   if (vtable_index.is_register()) {
1859     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1860     add(recv_klass, vtable_index.as_register(), recv_klass);
1861   } else {
1862     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1863   }
1864   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1865 }
1866 
1867 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
check_klass_subtype_fast_path(Register sub_klass,Register super_klass,Register temp1_reg,Register temp2_reg,Label * L_success,Label * L_failure,Label * L_slow_path,RegisterOrConstant super_check_offset)1868 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1869                                                    Register super_klass,
1870                                                    Register temp1_reg,
1871                                                    Register temp2_reg,
1872                                                    Label* L_success,
1873                                                    Label* L_failure,
1874                                                    Label* L_slow_path,
1875                                                    RegisterOrConstant super_check_offset) {
1876 
1877   const Register check_cache_offset = temp1_reg;
1878   const Register cached_super       = temp2_reg;
1879 
1880   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1881 
1882   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1883   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1884 
1885   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1886   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1887 
1888   Label L_fallthrough;
1889   int label_nulls = 0;
1890   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1891   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1892   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1893   assert(label_nulls <= 1 ||
1894          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1895          "at most one NULL in the batch, usually");
1896 
1897   // If the pointers are equal, we are done (e.g., String[] elements).
1898   // This self-check enables sharing of secondary supertype arrays among
1899   // non-primary types such as array-of-interface. Otherwise, each such
1900   // type would need its own customized SSA.
1901   // We move this check to the front of the fast path because many
1902   // type checks are in fact trivially successful in this manner,
1903   // so we get a nicely predicted branch right at the start of the check.
1904   cmpd(CCR0, sub_klass, super_klass);
1905   beq(CCR0, *L_success);
1906 
1907   // Check the supertype display:
1908   if (must_load_sco) {
1909     // The super check offset is always positive...
1910     lwz(check_cache_offset, sco_offset, super_klass);
1911     super_check_offset = RegisterOrConstant(check_cache_offset);
1912     // super_check_offset is register.
1913     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1914   }
1915   // The loaded value is the offset from KlassOopDesc.
1916 
1917   ld(cached_super, super_check_offset, sub_klass);
1918   cmpd(CCR0, cached_super, super_klass);
1919 
1920   // This check has worked decisively for primary supers.
1921   // Secondary supers are sought in the super_cache ('super_cache_addr').
1922   // (Secondary supers are interfaces and very deeply nested subtypes.)
1923   // This works in the same check above because of a tricky aliasing
1924   // between the super_cache and the primary super display elements.
1925   // (The 'super_check_addr' can address either, as the case requires.)
1926   // Note that the cache is updated below if it does not help us find
1927   // what we need immediately.
1928   // So if it was a primary super, we can just fail immediately.
1929   // Otherwise, it's the slow path for us (no success at this point).
1930 
1931 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1932 
1933   if (super_check_offset.is_register()) {
1934     beq(CCR0, *L_success);
1935     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1936     if (L_failure == &L_fallthrough) {
1937       beq(CCR0, *L_slow_path);
1938     } else {
1939       bne(CCR0, *L_failure);
1940       FINAL_JUMP(*L_slow_path);
1941     }
1942   } else {
1943     if (super_check_offset.as_constant() == sc_offset) {
1944       // Need a slow path; fast failure is impossible.
1945       if (L_slow_path == &L_fallthrough) {
1946         beq(CCR0, *L_success);
1947       } else {
1948         bne(CCR0, *L_slow_path);
1949         FINAL_JUMP(*L_success);
1950       }
1951     } else {
1952       // No slow path; it's a fast decision.
1953       if (L_failure == &L_fallthrough) {
1954         beq(CCR0, *L_success);
1955       } else {
1956         bne(CCR0, *L_failure);
1957         FINAL_JUMP(*L_success);
1958       }
1959     }
1960   }
1961 
1962   bind(L_fallthrough);
1963 #undef FINAL_JUMP
1964 }
1965 
check_klass_subtype_slow_path(Register sub_klass,Register super_klass,Register temp1_reg,Register temp2_reg,Label * L_success,Register result_reg)1966 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1967                                                    Register super_klass,
1968                                                    Register temp1_reg,
1969                                                    Register temp2_reg,
1970                                                    Label* L_success,
1971                                                    Register result_reg) {
1972   const Register array_ptr = temp1_reg; // current value from cache array
1973   const Register temp      = temp2_reg;
1974 
1975   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1976 
1977   int source_offset = in_bytes(Klass::secondary_supers_offset());
1978   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1979 
1980   int length_offset = Array<Klass*>::length_offset_in_bytes();
1981   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1982 
1983   Label hit, loop, failure, fallthru;
1984 
1985   ld(array_ptr, source_offset, sub_klass);
1986 
1987   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1988   lwz(temp, length_offset, array_ptr);
1989   cmpwi(CCR0, temp, 0);
1990   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1991 
1992   mtctr(temp); // load ctr
1993 
1994   bind(loop);
1995   // Oops in table are NO MORE compressed.
1996   ld(temp, base_offset, array_ptr);
1997   cmpd(CCR0, temp, super_klass);
1998   beq(CCR0, hit);
1999   addi(array_ptr, array_ptr, BytesPerWord);
2000   bdnz(loop);
2001 
2002   bind(failure);
2003   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2004   b(fallthru);
2005 
2006   bind(hit);
2007   std(super_klass, target_offset, sub_klass); // save result to cache
2008   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2009   if (L_success != NULL) { b(*L_success); }
2010   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2011 
2012   bind(fallthru);
2013 }
2014 
2015 // Try fast path, then go to slow one if not successful
check_klass_subtype(Register sub_klass,Register super_klass,Register temp1_reg,Register temp2_reg,Label & L_success)2016 void MacroAssembler::check_klass_subtype(Register sub_klass,
2017                          Register super_klass,
2018                          Register temp1_reg,
2019                          Register temp2_reg,
2020                          Label& L_success) {
2021   Label L_failure;
2022   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2023   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2024   bind(L_failure); // Fallthru if not successful.
2025 }
2026 
clinit_barrier(Register klass,Register thread,Label * L_fast_path,Label * L_slow_path)2027 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2028   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2029 
2030   Label L_fallthrough;
2031   if (L_fast_path == NULL) {
2032     L_fast_path = &L_fallthrough;
2033   } else if (L_slow_path == NULL) {
2034     L_slow_path = &L_fallthrough;
2035   }
2036 
2037   // Fast path check: class is fully initialized
2038   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2039   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2040   beq(CCR0, *L_fast_path);
2041 
2042   // Fast path check: current thread is initializer thread
2043   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2044   cmpd(CCR0, thread, R0);
2045   if (L_slow_path == &L_fallthrough) {
2046     beq(CCR0, *L_fast_path);
2047   } else if (L_fast_path == &L_fallthrough) {
2048     bne(CCR0, *L_slow_path);
2049   } else {
2050     Unimplemented();
2051   }
2052 
2053   bind(L_fallthrough);
2054 }
2055 
argument_offset(RegisterOrConstant arg_slot,Register temp_reg,int extra_slot_offset)2056 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2057                                                    Register temp_reg,
2058                                                    int extra_slot_offset) {
2059   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2060   int stackElementSize = Interpreter::stackElementSize;
2061   int offset = extra_slot_offset * stackElementSize;
2062   if (arg_slot.is_constant()) {
2063     offset += arg_slot.as_constant() * stackElementSize;
2064     return offset;
2065   } else {
2066     assert(temp_reg != noreg, "must specify");
2067     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2068     if (offset != 0)
2069       addi(temp_reg, temp_reg, offset);
2070     return temp_reg;
2071   }
2072 }
2073 
2074 // Supports temp2_reg = R0.
biased_locking_enter(ConditionRegister cr_reg,Register obj_reg,Register mark_reg,Register temp_reg,Register temp2_reg,Label & done,Label * slow_case)2075 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2076                                           Register mark_reg, Register temp_reg,
2077                                           Register temp2_reg, Label& done, Label* slow_case) {
2078   assert(UseBiasedLocking, "why call this otherwise?");
2079 
2080 #ifdef ASSERT
2081   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2082 #endif
2083 
2084   Label cas_label;
2085 
2086   // Branch to done if fast path fails and no slow_case provided.
2087   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2088 
2089   // Biased locking
2090   // See whether the lock is currently biased toward our thread and
2091   // whether the epoch is still valid
2092   // Note that the runtime guarantees sufficient alignment of JavaThread
2093   // pointers to allow age to be placed into low bits
2094   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2095          "biased locking makes assumptions about bit layout");
2096 
2097   if (PrintBiasedLockingStatistics) {
2098     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2099     lwzx(temp_reg, temp2_reg);
2100     addi(temp_reg, temp_reg, 1);
2101     stwx(temp_reg, temp2_reg);
2102   }
2103 
2104   andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2105   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2106   bne(cr_reg, cas_label);
2107 
2108   load_klass(temp_reg, obj_reg);
2109 
2110   load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2111   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2112   orr(temp_reg, R16_thread, temp_reg);
2113   xorr(temp_reg, mark_reg, temp_reg);
2114   andr(temp_reg, temp_reg, temp2_reg);
2115   cmpdi(cr_reg, temp_reg, 0);
2116   if (PrintBiasedLockingStatistics) {
2117     Label l;
2118     bne(cr_reg, l);
2119     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2120     lwzx(mark_reg, temp2_reg);
2121     addi(mark_reg, mark_reg, 1);
2122     stwx(mark_reg, temp2_reg);
2123     // restore mark_reg
2124     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2125     bind(l);
2126   }
2127   beq(cr_reg, done);
2128 
2129   Label try_revoke_bias;
2130   Label try_rebias;
2131 
2132   // At this point we know that the header has the bias pattern and
2133   // that we are not the bias owner in the current epoch. We need to
2134   // figure out more details about the state of the header in order to
2135   // know what operations can be legally performed on the object's
2136   // header.
2137 
2138   // If the low three bits in the xor result aren't clear, that means
2139   // the prototype header is no longer biased and we have to revoke
2140   // the bias on this object.
2141   andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2142   cmpwi(cr_reg, temp2_reg, 0);
2143   bne(cr_reg, try_revoke_bias);
2144 
2145   // Biasing is still enabled for this data type. See whether the
2146   // epoch of the current bias is still valid, meaning that the epoch
2147   // bits of the mark word are equal to the epoch bits of the
2148   // prototype header. (Note that the prototype header's epoch bits
2149   // only change at a safepoint.) If not, attempt to rebias the object
2150   // toward the current thread. Note that we must be absolutely sure
2151   // that the current epoch is invalid in order to do this because
2152   // otherwise the manipulations it performs on the mark word are
2153   // illegal.
2154 
2155   int shift_amount = 64 - markWord::epoch_shift;
2156   // rotate epoch bits to right (little) end and set other bits to 0
2157   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2158   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2159   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2160   bne(CCR0, try_rebias);
2161 
2162   // The epoch of the current bias is still valid but we know nothing
2163   // about the owner; it might be set or it might be clear. Try to
2164   // acquire the bias of the object using an atomic operation. If this
2165   // fails we will go in to the runtime to revoke the object's bias.
2166   // Note that we first construct the presumed unbiased header so we
2167   // don't accidentally blow away another thread's valid bias.
2168   andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2169                                 markWord::age_mask_in_place |
2170                                 markWord::epoch_mask_in_place));
2171   orr(temp_reg, R16_thread, mark_reg);
2172 
2173   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2174 
2175   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2176   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2177            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2178            /*where=*/obj_reg,
2179            MacroAssembler::MemBarAcq,
2180            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2181            noreg, slow_case_int); // bail out if failed
2182 
2183   // If the biasing toward our thread failed, this means that
2184   // another thread succeeded in biasing it toward itself and we
2185   // need to revoke that bias. The revocation will occur in the
2186   // interpreter runtime in the slow case.
2187   if (PrintBiasedLockingStatistics) {
2188     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2189     lwzx(temp_reg, temp2_reg);
2190     addi(temp_reg, temp_reg, 1);
2191     stwx(temp_reg, temp2_reg);
2192   }
2193   b(done);
2194 
2195   bind(try_rebias);
2196   // At this point we know the epoch has expired, meaning that the
2197   // current "bias owner", if any, is actually invalid. Under these
2198   // circumstances _only_, we are allowed to use the current header's
2199   // value as the comparison value when doing the cas to acquire the
2200   // bias in the current epoch. In other words, we allow transfer of
2201   // the bias from one thread to another directly in this situation.
2202   load_klass(temp_reg, obj_reg);
2203   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2204   orr(temp2_reg, R16_thread, temp2_reg);
2205   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2206   orr(temp_reg, temp2_reg, temp_reg);
2207 
2208   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2209 
2210   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2211                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2212                  /*where=*/obj_reg,
2213                  MacroAssembler::MemBarAcq,
2214                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2215                  noreg, slow_case_int); // bail out if failed
2216 
2217   // If the biasing toward our thread failed, this means that
2218   // another thread succeeded in biasing it toward itself and we
2219   // need to revoke that bias. The revocation will occur in the
2220   // interpreter runtime in the slow case.
2221   if (PrintBiasedLockingStatistics) {
2222     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2223     lwzx(temp_reg, temp2_reg);
2224     addi(temp_reg, temp_reg, 1);
2225     stwx(temp_reg, temp2_reg);
2226   }
2227   b(done);
2228 
2229   bind(try_revoke_bias);
2230   // The prototype mark in the klass doesn't have the bias bit set any
2231   // more, indicating that objects of this data type are not supposed
2232   // to be biased any more. We are going to try to reset the mark of
2233   // this object to the prototype value and fall through to the
2234   // CAS-based locking scheme. Note that if our CAS fails, it means
2235   // that another thread raced us for the privilege of revoking the
2236   // bias of this particular object, so it's okay to continue in the
2237   // normal locking code.
2238   load_klass(temp_reg, obj_reg);
2239   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2240   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2241   orr(temp_reg, temp_reg, temp2_reg);
2242 
2243   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2244 
2245   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2246   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2247                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2248                  /*where=*/obj_reg,
2249                  MacroAssembler::MemBarAcq,
2250                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2251 
2252   // reload markWord in mark_reg before continuing with lightweight locking
2253   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2254 
2255   // Fall through to the normal CAS-based lock, because no matter what
2256   // the result of the above CAS, some thread must have succeeded in
2257   // removing the bias bit from the object's header.
2258   if (PrintBiasedLockingStatistics) {
2259     Label l;
2260     bne(cr_reg, l);
2261     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2262     lwzx(temp_reg, temp2_reg);
2263     addi(temp_reg, temp_reg, 1);
2264     stwx(temp_reg, temp2_reg);
2265     bind(l);
2266   }
2267 
2268   bind(cas_label);
2269 }
2270 
biased_locking_exit(ConditionRegister cr_reg,Register mark_addr,Register temp_reg,Label & done)2271 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2272   // Check for biased locking unlock case, which is a no-op
2273   // Note: we do not have to check the thread ID for two reasons.
2274   // First, the interpreter checks for IllegalMonitorStateException at
2275   // a higher level. Second, if the bias was revoked while we held the
2276   // lock, the object could not be rebiased toward another thread, so
2277   // the bias bit would be clear.
2278 
2279   ld(temp_reg, 0, mark_addr);
2280   andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2281 
2282   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2283   beq(cr_reg, done);
2284 }
2285 
2286 // allocation (for C1)
eden_allocate(Register obj,Register var_size_in_bytes,int con_size_in_bytes,Register t1,Register t2,Label & slow_case)2287 void MacroAssembler::eden_allocate(
2288   Register obj,                      // result: pointer to object after successful allocation
2289   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2290   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2291   Register t1,                       // temp register
2292   Register t2,                       // temp register
2293   Label&   slow_case                 // continuation point if fast allocation fails
2294 ) {
2295   b(slow_case);
2296 }
2297 
tlab_allocate(Register obj,Register var_size_in_bytes,int con_size_in_bytes,Register t1,Label & slow_case)2298 void MacroAssembler::tlab_allocate(
2299   Register obj,                      // result: pointer to object after successful allocation
2300   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2301   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2302   Register t1,                       // temp register
2303   Label&   slow_case                 // continuation point if fast allocation fails
2304 ) {
2305   // make sure arguments make sense
2306   assert_different_registers(obj, var_size_in_bytes, t1);
2307   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2308   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2309 
2310   const Register new_top = t1;
2311   //verify_tlab(); not implemented
2312 
2313   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2314   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2315   if (var_size_in_bytes == noreg) {
2316     addi(new_top, obj, con_size_in_bytes);
2317   } else {
2318     add(new_top, obj, var_size_in_bytes);
2319   }
2320   cmpld(CCR0, new_top, R0);
2321   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2322 
2323 #ifdef ASSERT
2324   // make sure new free pointer is properly aligned
2325   {
2326     Label L;
2327     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2328     beq(CCR0, L);
2329     stop("updated TLAB free is not properly aligned");
2330     bind(L);
2331   }
2332 #endif // ASSERT
2333 
2334   // update the tlab top pointer
2335   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2336   //verify_tlab(); not implemented
2337 }
incr_allocated_bytes(RegisterOrConstant size_in_bytes,Register t1,Register t2)2338 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2339   unimplemented("incr_allocated_bytes");
2340 }
2341 
emit_trampoline_stub(int destination_toc_offset,int insts_call_instruction_offset,Register Rtoc)2342 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2343                                              int insts_call_instruction_offset, Register Rtoc) {
2344   // Start the stub.
2345   address stub = start_a_stub(64);
2346   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2347 
2348   // Create a trampoline stub relocation which relates this trampoline stub
2349   // with the call instruction at insts_call_instruction_offset in the
2350   // instructions code-section.
2351   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2352   const int stub_start_offset = offset();
2353 
2354   // For java_to_interp stubs we use R11_scratch1 as scratch register
2355   // and in call trampoline stubs we use R12_scratch2. This way we
2356   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2357   Register reg_scratch = R12_scratch2;
2358 
2359   // Now, create the trampoline stub's code:
2360   // - load the TOC
2361   // - load the call target from the constant pool
2362   // - call
2363   if (Rtoc == noreg) {
2364     calculate_address_from_global_toc(reg_scratch, method_toc());
2365     Rtoc = reg_scratch;
2366   }
2367 
2368   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2369   mtctr(reg_scratch);
2370   bctr();
2371 
2372   const address stub_start_addr = addr_at(stub_start_offset);
2373 
2374   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2375   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2376          "encoded offset into the constant pool must match");
2377   // Trampoline_stub_size should be good.
2378   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2379   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2380 
2381   // End the stub.
2382   end_a_stub();
2383   return stub;
2384 }
2385 
2386 // TM on PPC64.
atomic_inc_ptr(Register addr,Register result,int simm16)2387 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2388   Label retry;
2389   bind(retry);
2390   ldarx(result, addr, /*hint*/ false);
2391   addi(result, result, simm16);
2392   stdcx_(result, addr);
2393   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2394     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2395   } else {
2396     bne(                  CCR0, retry); // stXcx_ sets CCR0
2397   }
2398 }
2399 
atomic_ori_int(Register addr,Register result,int uimm16)2400 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2401   Label retry;
2402   bind(retry);
2403   lwarx(result, addr, /*hint*/ false);
2404   ori(result, result, uimm16);
2405   stwcx_(result, addr);
2406   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2407     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2408   } else {
2409     bne(                  CCR0, retry); // stXcx_ sets CCR0
2410   }
2411 }
2412 
2413 #if INCLUDE_RTM_OPT
2414 
2415 // Update rtm_counters based on abort status
2416 // input: abort_status
2417 //        rtm_counters_Reg (RTMLockingCounters*)
rtm_counters_update(Register abort_status,Register rtm_counters_Reg)2418 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2419   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2420   // x86 ppc (! means inverted, ? means not the same)
2421   //  0   31  Set if abort caused by XABORT instruction.
2422   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2423   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2424   //  3   10  Set if an internal buffer overflowed.
2425   //  4  ?12  Set if a debug breakpoint was hit.
2426   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2427   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2428                              tm_failure_persistent,
2429                              tm_non_trans_cf,
2430                              tm_trans_cf,
2431                              tm_footprint_of,
2432                              tm_failure_code,
2433                              tm_transaction_level};
2434 
2435   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2436   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2437 
2438   const int bit2counter_map[][num_counters] =
2439   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2440   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2441   // Care must be taken when mapping bits to counters as bits for a given
2442   // counter must be mutually exclusive. Otherwise, the counter will be
2443   // incremented more than once.
2444   // counters:
2445   // 0        1        2         3         4         5
2446   // abort  , persist, conflict, overflow, debug   , nested         bits:
2447   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2448    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2449    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2450    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2451    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2452    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2453    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2454   // ...
2455 
2456   // Move abort_status value to R0 and use abort_status register as a
2457   // temporary register because R0 as third operand in ld/std is treated
2458   // as base address zero (value). Likewise, R0 as second operand in addi
2459   // is problematic because it amounts to li.
2460   const Register temp_Reg = abort_status;
2461   const Register abort_status_R0 = R0;
2462   mr(abort_status_R0, abort_status);
2463 
2464   // Increment total abort counter.
2465   int counters_offs = RTMLockingCounters::abort_count_offset();
2466   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2467   addi(temp_Reg, temp_Reg, 1);
2468   std(temp_Reg, counters_offs, rtm_counters_Reg);
2469 
2470   // Increment specific abort counters.
2471   if (PrintPreciseRTMLockingStatistics) {
2472 
2473     // #0 counter offset.
2474     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2475 
2476     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2477       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2478         if (bit2counter_map[nbit][ncounter] != 0) {
2479           Label check_abort;
2480           int abort_counter_offs = abortX_offs + (ncounter << 3);
2481 
2482           if (failure_bit[nbit] == tm_transaction_level) {
2483             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2484             // 11 bits in the TL field are checked to find out if failure
2485             // occured in a nested transaction. This check also matches
2486             // the case when nesting_of = 1 (nesting overflow).
2487             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2488           } else if (failure_bit[nbit] == tm_failure_code) {
2489             // Check failure code for trap or illegal caught in TM.
2490             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2491             // tabort or treclaim source operand.
2492             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2493             rldicl(temp_Reg, abort_status_R0, 8, 56);
2494             cmpdi(CCR0, temp_Reg, 0xD4);
2495           } else {
2496             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2497           }
2498 
2499           if (bit2counter_map[nbit][ncounter] == 1) {
2500             beq(CCR0, check_abort);
2501           } else {
2502             bne(CCR0, check_abort);
2503           }
2504 
2505           // We don't increment atomically.
2506           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2507           addi(temp_Reg, temp_Reg, 1);
2508           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2509 
2510           bind(check_abort);
2511         }
2512       }
2513     }
2514   }
2515   // Restore abort_status.
2516   mr(abort_status, abort_status_R0);
2517 }
2518 
2519 // Branch if (random & (count-1) != 0), count is 2^n
2520 // tmp and CR0 are killed
branch_on_random_using_tb(Register tmp,int count,Label & brLabel)2521 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2522   mftb(tmp);
2523   andi_(tmp, tmp, count-1);
2524   bne(CCR0, brLabel);
2525 }
2526 
2527 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2528 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
rtm_abort_ratio_calculation(Register rtm_counters_Reg,RTMLockingCounters * rtm_counters,Metadata * method_data)2529 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2530                                                  RTMLockingCounters* rtm_counters,
2531                                                  Metadata* method_data) {
2532   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2533 
2534   if (RTMLockingCalculationDelay > 0) {
2535     // Delay calculation.
2536     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2537     cmpdi(CCR0, rtm_counters_Reg, 0);
2538     beq(CCR0, L_done);
2539     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2540   }
2541   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2542   //   Aborted transactions = abort_count * 100
2543   //   All transactions = total_count *  RTMTotalCountIncrRate
2544   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2545   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2546   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2547     cmpdi(CCR0, R0, RTMAbortThreshold);
2548     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2549   } else {
2550     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2551     cmpd(CCR0, R0, rtm_counters_Reg);
2552     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2553   }
2554   mulli(R0, R0, 100);
2555 
2556   const Register tmpReg = rtm_counters_Reg;
2557   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2558   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2559   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2560   cmpd(CCR0, R0, tmpReg);
2561   blt(CCR0, L_check_always_rtm1); // jump to reload
2562   if (method_data != NULL) {
2563     // Set rtm_state to "no rtm" in MDO.
2564     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2565     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2566     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2567     atomic_ori_int(R0, tmpReg, NoRTM);
2568   }
2569   b(L_done);
2570 
2571   bind(L_check_always_rtm1);
2572   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2573   bind(L_check_always_rtm2);
2574   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2575   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2576   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2577     cmpdi(CCR0, tmpReg, thresholdValue);
2578   } else {
2579     load_const_optimized(R0, thresholdValue);
2580     cmpd(CCR0, tmpReg, R0);
2581   }
2582   blt(CCR0, L_done);
2583   if (method_data != NULL) {
2584     // Set rtm_state to "always rtm" in MDO.
2585     // Not using a metadata relocation. See above.
2586     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2587     atomic_ori_int(R0, tmpReg, UseRTM);
2588   }
2589   bind(L_done);
2590 }
2591 
2592 // Update counters and perform abort ratio calculation.
2593 // input: abort_status_Reg
rtm_profiling(Register abort_status_Reg,Register temp_Reg,RTMLockingCounters * rtm_counters,Metadata * method_data,bool profile_rtm)2594 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2595                                    RTMLockingCounters* rtm_counters,
2596                                    Metadata* method_data,
2597                                    bool profile_rtm) {
2598 
2599   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2600   // Update rtm counters based on state at abort.
2601   // Reads abort_status_Reg, updates flags.
2602   assert_different_registers(abort_status_Reg, temp_Reg);
2603   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2604   rtm_counters_update(abort_status_Reg, temp_Reg);
2605   if (profile_rtm) {
2606     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2607     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2608   }
2609 }
2610 
2611 // Retry on abort if abort's status indicates non-persistent failure.
2612 // inputs: retry_count_Reg
2613 //       : abort_status_Reg
2614 // output: retry_count_Reg decremented by 1
rtm_retry_lock_on_abort(Register retry_count_Reg,Register abort_status_Reg,Label & retryLabel,Label * checkRetry)2615 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2616                                              Label& retryLabel, Label* checkRetry) {
2617   Label doneRetry;
2618 
2619   // Don't retry if failure is persistent.
2620   // The persistent bit is set when a (A) Disallowed operation is performed in
2621   // transactional state, like for instance trying to write the TFHAR after a
2622   // transaction is started; or when there is (B) a Nesting Overflow (too many
2623   // nested transactions); or when (C) the Footprint overflows (too many
2624   // addressess touched in TM state so there is no more space in the footprint
2625   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2626   // store is performed to a given address in TM state, then once in suspended
2627   // state the same address is accessed. Failure (A) is very unlikely to occur
2628   // in the JVM. Failure (D) will never occur because Suspended state is never
2629   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2630   // Overflow will set the persistent bit.
2631   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2632   bne(CCR0, doneRetry);
2633 
2634   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2635   // tabort instruction.
2636   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2637   bne(CCR0, doneRetry);
2638 
2639   // Retry if transaction aborted due to a conflict with another thread.
2640   if (checkRetry) { bind(*checkRetry); }
2641   addic_(retry_count_Reg, retry_count_Reg, -1);
2642   blt(CCR0, doneRetry);
2643   b(retryLabel);
2644   bind(doneRetry);
2645 }
2646 
2647 // Spin and retry if lock is busy.
2648 // inputs: owner_addr_Reg (monitor address)
2649 //       : retry_count_Reg
2650 // output: retry_count_Reg decremented by 1
2651 // CTR is killed
rtm_retry_lock_on_busy(Register retry_count_Reg,Register owner_addr_Reg,Label & retryLabel)2652 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2653   Label SpinLoop, doneRetry, doRetry;
2654   addic_(retry_count_Reg, retry_count_Reg, -1);
2655   blt(CCR0, doneRetry);
2656 
2657   if (RTMSpinLoopCount > 1) {
2658     li(R0, RTMSpinLoopCount);
2659     mtctr(R0);
2660   }
2661 
2662   // low thread priority
2663   smt_prio_low();
2664   bind(SpinLoop);
2665 
2666   if (RTMSpinLoopCount > 1) {
2667     bdz(doRetry);
2668     ld(R0, 0, owner_addr_Reg);
2669     cmpdi(CCR0, R0, 0);
2670     bne(CCR0, SpinLoop);
2671   }
2672 
2673   bind(doRetry);
2674 
2675   // restore thread priority to default in userspace
2676 #ifdef LINUX
2677   smt_prio_medium_low();
2678 #else
2679   smt_prio_medium();
2680 #endif
2681 
2682   b(retryLabel);
2683 
2684   bind(doneRetry);
2685 }
2686 
2687 // Use RTM for normal stack locks.
2688 // Input: objReg (object to lock)
rtm_stack_locking(ConditionRegister flag,Register obj,Register mark_word,Register tmp,Register retry_on_abort_count_Reg,RTMLockingCounters * stack_rtm_counters,Metadata * method_data,bool profile_rtm,Label & DONE_LABEL,Label & IsInflated)2689 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2690                                        Register obj, Register mark_word, Register tmp,
2691                                        Register retry_on_abort_count_Reg,
2692                                        RTMLockingCounters* stack_rtm_counters,
2693                                        Metadata* method_data, bool profile_rtm,
2694                                        Label& DONE_LABEL, Label& IsInflated) {
2695   assert(UseRTMForStackLocks, "why call this otherwise?");
2696   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2697   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2698 
2699   if (RTMRetryCount > 0) {
2700     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2701     bind(L_rtm_retry);
2702   }
2703   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
2704   bne(CCR0, IsInflated);
2705 
2706   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2707     Label L_noincrement;
2708     if (RTMTotalCountIncrRate > 1) {
2709       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2710     }
2711     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2712     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2713     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2714     ldx(mark_word, tmp);
2715     addi(mark_word, mark_word, 1);
2716     stdx(mark_word, tmp);
2717     bind(L_noincrement);
2718   }
2719   tbegin_();
2720   beq(CCR0, L_on_abort);
2721   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);      // Reload in transaction, conflicts need to be tracked.
2722   andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2723   cmpwi(flag, R0, markWord::unlocked_value);                // bits = 001 unlocked
2724   beq(flag, DONE_LABEL);                                    // all done if unlocked
2725 
2726   if (UseRTMXendForLockBusy) {
2727     tend_();
2728     b(L_decrement_retry);
2729   } else {
2730     tabort_();
2731   }
2732   bind(L_on_abort);
2733   const Register abort_status_Reg = tmp;
2734   mftexasr(abort_status_Reg);
2735   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2736     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2737   }
2738   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2739   if (RTMRetryCount > 0) {
2740     // Retry on lock abort if abort status is not permanent.
2741     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2742   } else {
2743     bind(L_decrement_retry);
2744   }
2745 }
2746 
2747 // Use RTM for inflating locks
2748 // inputs: obj       (object to lock)
2749 //         mark_word (current header - KILLED)
2750 //         boxReg    (on-stack box address (displaced header location) - KILLED)
rtm_inflated_locking(ConditionRegister flag,Register obj,Register mark_word,Register boxReg,Register retry_on_busy_count_Reg,Register retry_on_abort_count_Reg,RTMLockingCounters * rtm_counters,Metadata * method_data,bool profile_rtm,Label & DONE_LABEL)2751 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2752                                           Register obj, Register mark_word, Register boxReg,
2753                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2754                                           RTMLockingCounters* rtm_counters,
2755                                           Metadata* method_data, bool profile_rtm,
2756                                           Label& DONE_LABEL) {
2757   assert(UseRTMLocking, "why call this otherwise?");
2758   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2759   // Clean monitor_value bit to get valid pointer.
2760   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2761 
2762   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2763   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2764   const Register tmpReg = boxReg;
2765   const Register owner_addr_Reg = mark_word;
2766   addi(owner_addr_Reg, mark_word, owner_offset);
2767 
2768   if (RTMRetryCount > 0) {
2769     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2770     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2771     bind(L_rtm_retry);
2772   }
2773   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2774     Label L_noincrement;
2775     if (RTMTotalCountIncrRate > 1) {
2776       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2777     }
2778     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2779     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2780     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2781     ldx(tmpReg, R0);
2782     addi(tmpReg, tmpReg, 1);
2783     stdx(tmpReg, R0);
2784     bind(L_noincrement);
2785   }
2786   tbegin_();
2787   beq(CCR0, L_on_abort);
2788   // We don't reload mark word. Will only be reset at safepoint.
2789   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2790   cmpdi(flag, R0, 0);
2791   beq(flag, DONE_LABEL);
2792 
2793   if (UseRTMXendForLockBusy) {
2794     tend_();
2795     b(L_decrement_retry);
2796   } else {
2797     tabort_();
2798   }
2799   bind(L_on_abort);
2800   const Register abort_status_Reg = tmpReg;
2801   mftexasr(abort_status_Reg);
2802   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2803     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2804     // Restore owner_addr_Reg
2805     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2806 #ifdef ASSERT
2807     andi_(R0, mark_word, markWord::monitor_value);
2808     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2809 #endif
2810     addi(owner_addr_Reg, mark_word, owner_offset);
2811   }
2812   if (RTMRetryCount > 0) {
2813     // Retry on lock abort if abort status is not permanent.
2814     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2815   }
2816 
2817   // Appears unlocked - try to swing _owner from null to non-null.
2818   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2819            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2820            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2821 
2822   if (RTMRetryCount > 0) {
2823     // success done else retry
2824     b(DONE_LABEL);
2825     bind(L_decrement_retry);
2826     // Spin and retry if lock is busy.
2827     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2828   } else {
2829     bind(L_decrement_retry);
2830   }
2831 }
2832 
2833 #endif //  INCLUDE_RTM_OPT
2834 
2835 // "The box" is the space on the stack where we copy the object mark.
compiler_fast_lock_object(ConditionRegister flag,Register oop,Register box,Register temp,Register displaced_header,Register current_header,bool try_bias,RTMLockingCounters * rtm_counters,RTMLockingCounters * stack_rtm_counters,Metadata * method_data,bool use_rtm,bool profile_rtm)2836 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2837                                                Register temp, Register displaced_header, Register current_header,
2838                                                bool try_bias,
2839                                                RTMLockingCounters* rtm_counters,
2840                                                RTMLockingCounters* stack_rtm_counters,
2841                                                Metadata* method_data,
2842                                                bool use_rtm, bool profile_rtm) {
2843   assert_different_registers(oop, box, temp, displaced_header, current_header);
2844   assert(flag != CCR0, "bad condition register");
2845   Label cont;
2846   Label object_has_monitor;
2847   Label cas_failed;
2848 
2849   // Load markWord from object into displaced_header.
2850   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2851 
2852 
2853   if (try_bias) {
2854     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2855   }
2856 
2857 #if INCLUDE_RTM_OPT
2858   if (UseRTMForStackLocks && use_rtm) {
2859     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2860                       stack_rtm_counters, method_data, profile_rtm,
2861                       cont, object_has_monitor);
2862   }
2863 #endif // INCLUDE_RTM_OPT
2864 
2865   // Handle existing monitor.
2866   // The object has an existing monitor iff (mark & monitor_value) != 0.
2867   andi_(temp, displaced_header, markWord::monitor_value);
2868   bne(CCR0, object_has_monitor);
2869 
2870   // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2871   ori(displaced_header, displaced_header, markWord::unlocked_value);
2872 
2873   // Load Compare Value application register.
2874 
2875   // Initialize the box. (Must happen before we update the object mark!)
2876   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2877 
2878   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2879   // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2880   cmpxchgd(/*flag=*/flag,
2881            /*current_value=*/current_header,
2882            /*compare_value=*/displaced_header,
2883            /*exchange_value=*/box,
2884            /*where=*/oop,
2885            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2886            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2887            noreg,
2888            &cas_failed,
2889            /*check without membar and ldarx first*/true);
2890   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2891 
2892   // If the compare-and-exchange succeeded, then we found an unlocked
2893   // object and we have now locked it.
2894   b(cont);
2895 
2896   bind(cas_failed);
2897   // We did not see an unlocked object so try the fast recursive case.
2898 
2899   // Check if the owner is self by comparing the value in the markWord of object
2900   // (current_header) with the stack pointer.
2901   sub(current_header, current_header, R1_SP);
2902   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2903 
2904   and_(R0/*==0?*/, current_header, temp);
2905   // If condition is true we are cont and hence we can store 0 as the
2906   // displaced header in the box, which indicates that it is a recursive lock.
2907   mcrf(flag,CCR0);
2908   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2909 
2910   // Handle existing monitor.
2911   b(cont);
2912 
2913   bind(object_has_monitor);
2914   // The object's monitor m is unlocked iff m->owner == NULL,
2915   // otherwise m->owner may contain a thread or a stack address.
2916 
2917 #if INCLUDE_RTM_OPT
2918   // Use the same RTM locking code in 32- and 64-bit VM.
2919   if (use_rtm) {
2920     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2921                          rtm_counters, method_data, profile_rtm, cont);
2922   } else {
2923 #endif // INCLUDE_RTM_OPT
2924 
2925   // Try to CAS m->owner from NULL to current thread.
2926   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2927   cmpxchgd(/*flag=*/flag,
2928            /*current_value=*/current_header,
2929            /*compare_value=*/(intptr_t)0,
2930            /*exchange_value=*/R16_thread,
2931            /*where=*/temp,
2932            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2933            MacroAssembler::cmpxchgx_hint_acquire_lock());
2934 
2935   // Store a non-null value into the box.
2936   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2937 
2938 # ifdef ASSERT
2939   bne(flag, cont);
2940   // We have acquired the monitor, check some invariants.
2941   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2942   // Invariant 1: _recursions should be 0.
2943   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2944   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2945                             "monitor->_recursions should be 0");
2946 # endif
2947 
2948 #if INCLUDE_RTM_OPT
2949   } // use_rtm()
2950 #endif
2951 
2952   bind(cont);
2953   // flag == EQ indicates success
2954   // flag == NE indicates failure
2955 }
2956 
compiler_fast_unlock_object(ConditionRegister flag,Register oop,Register box,Register temp,Register displaced_header,Register current_header,bool try_bias,bool use_rtm)2957 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2958                                                  Register temp, Register displaced_header, Register current_header,
2959                                                  bool try_bias, bool use_rtm) {
2960   assert_different_registers(oop, box, temp, displaced_header, current_header);
2961   assert(flag != CCR0, "bad condition register");
2962   Label cont;
2963   Label object_has_monitor;
2964 
2965   if (try_bias) {
2966     biased_locking_exit(flag, oop, current_header, cont);
2967   }
2968 
2969 #if INCLUDE_RTM_OPT
2970   if (UseRTMForStackLocks && use_rtm) {
2971     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2972     Label L_regular_unlock;
2973     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
2974     andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2975     cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
2976     bne(flag, L_regular_unlock);                                   // else RegularLock
2977     tend_();                                                       // otherwise end...
2978     b(cont);                                                       // ... and we're done
2979     bind(L_regular_unlock);
2980   }
2981 #endif
2982 
2983   // Find the lock address and load the displaced header from the stack.
2984   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2985 
2986   // If the displaced header is 0, we have a recursive unlock.
2987   cmpdi(flag, displaced_header, 0);
2988   beq(flag, cont);
2989 
2990   // Handle existing monitor.
2991   // The object has an existing monitor iff (mark & monitor_value) != 0.
2992   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2993   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2994   andi_(R0, current_header, markWord::monitor_value);
2995   bne(CCR0, object_has_monitor);
2996 
2997   // Check if it is still a light weight lock, this is is true if we see
2998   // the stack address of the basicLock in the markWord of the object.
2999   // Cmpxchg sets flag to cmpd(current_header, box).
3000   cmpxchgd(/*flag=*/flag,
3001            /*current_value=*/current_header,
3002            /*compare_value=*/box,
3003            /*exchange_value=*/displaced_header,
3004            /*where=*/oop,
3005            MacroAssembler::MemBarRel,
3006            MacroAssembler::cmpxchgx_hint_release_lock(),
3007            noreg,
3008            &cont);
3009 
3010   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
3011 
3012   // Handle existing monitor.
3013   b(cont);
3014 
3015   bind(object_has_monitor);
3016   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3017   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3018   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
3019 
3020     // It's inflated.
3021 #if INCLUDE_RTM_OPT
3022   if (use_rtm) {
3023     Label L_regular_inflated_unlock;
3024     // Clean monitor_value bit to get valid pointer
3025     cmpdi(flag, temp, 0);
3026     bne(flag, L_regular_inflated_unlock);
3027     tend_();
3028     b(cont);
3029     bind(L_regular_inflated_unlock);
3030   }
3031 #endif
3032 
3033   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3034   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3035   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3036   cmpdi(flag, temp, 0);
3037   bne(flag, cont);
3038 
3039   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3040   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3041   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3042   cmpdi(flag, temp, 0);
3043   bne(flag, cont);
3044   release();
3045   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3046 
3047   bind(cont);
3048   // flag == EQ indicates success
3049   // flag == NE indicates failure
3050 }
3051 
safepoint_poll(Label & slow_path,Register temp_reg)3052 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3053   ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3054   // Armed page has poll_bit set.
3055   andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3056   bne(CCR0, slow_path);
3057 }
3058 
resolve_jobject(Register value,Register tmp1,Register tmp2,bool needs_frame)3059 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3060   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3061   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3062 }
3063 
3064 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3065 // in frame_ppc.hpp.
set_last_Java_frame(Register last_Java_sp,Register last_Java_pc)3066 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3067   // Always set last_Java_pc and flags first because once last_Java_sp
3068   // is visible has_last_Java_frame is true and users will look at the
3069   // rest of the fields. (Note: flags should always be zero before we
3070   // get here so doesn't need to be set.)
3071 
3072   // Verify that last_Java_pc was zeroed on return to Java
3073   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3074                           "last_Java_pc not zeroed before leaving Java");
3075 
3076   // When returning from calling out from Java mode the frame anchor's
3077   // last_Java_pc will always be set to NULL. It is set here so that
3078   // if we are doing a call to native (not VM) that we capture the
3079   // known pc and don't have to rely on the native call having a
3080   // standard frame linkage where we can find the pc.
3081   if (last_Java_pc != noreg)
3082     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3083 
3084   // Set last_Java_sp last.
3085   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3086 }
3087 
reset_last_Java_frame(void)3088 void MacroAssembler::reset_last_Java_frame(void) {
3089   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3090                              R16_thread, "SP was not set, still zero");
3091 
3092   BLOCK_COMMENT("reset_last_Java_frame {");
3093   li(R0, 0);
3094 
3095   // _last_Java_sp = 0
3096   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3097 
3098   // _last_Java_pc = 0
3099   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3100   BLOCK_COMMENT("} reset_last_Java_frame");
3101 }
3102 
set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp,Register tmp1)3103 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3104   assert_different_registers(sp, tmp1);
3105 
3106   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3107   // TOP_IJAVA_FRAME_ABI.
3108   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3109   address entry = pc();
3110   load_const_optimized(tmp1, entry);
3111 
3112   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3113 }
3114 
get_vm_result(Register oop_result)3115 void MacroAssembler::get_vm_result(Register oop_result) {
3116   // Read:
3117   //   R16_thread
3118   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3119   //
3120   // Updated:
3121   //   oop_result
3122   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3123 
3124   verify_thread();
3125 
3126   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3127   li(R0, 0);
3128   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3129 
3130   verify_oop(oop_result, FILE_AND_LINE);
3131 }
3132 
get_vm_result_2(Register metadata_result)3133 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3134   // Read:
3135   //   R16_thread
3136   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3137   //
3138   // Updated:
3139   //   metadata_result
3140   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3141 
3142   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3143   li(R0, 0);
3144   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3145 }
3146 
encode_klass_not_null(Register dst,Register src)3147 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3148   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3149   if (CompressedKlassPointers::base() != 0) {
3150     // Use dst as temp if it is free.
3151     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3152     current = dst;
3153   }
3154   if (CompressedKlassPointers::shift() != 0) {
3155     srdi(dst, current, CompressedKlassPointers::shift());
3156     current = dst;
3157   }
3158   return current;
3159 }
3160 
store_klass(Register dst_oop,Register klass,Register ck)3161 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3162   if (UseCompressedClassPointers) {
3163     Register compressedKlass = encode_klass_not_null(ck, klass);
3164     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3165   } else {
3166     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3167   }
3168 }
3169 
store_klass_gap(Register dst_oop,Register val)3170 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3171   if (UseCompressedClassPointers) {
3172     if (val == noreg) {
3173       val = R0;
3174       li(val, 0);
3175     }
3176     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3177   }
3178 }
3179 
instr_size_for_decode_klass_not_null()3180 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3181   if (!UseCompressedClassPointers) return 0;
3182   int num_instrs = 1;  // shift or move
3183   if (CompressedKlassPointers::base() != 0) num_instrs = 7;  // shift + load const + add
3184   return num_instrs * BytesPerInstWord;
3185 }
3186 
decode_klass_not_null(Register dst,Register src)3187 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3188   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3189   if (src == noreg) src = dst;
3190   Register shifted_src = src;
3191   if (CompressedKlassPointers::shift() != 0 ||
3192       CompressedKlassPointers::base() == 0 && src != dst) {  // Move required.
3193     shifted_src = dst;
3194     sldi(shifted_src, src, CompressedKlassPointers::shift());
3195   }
3196   if (CompressedKlassPointers::base() != 0) {
3197     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3198   }
3199 }
3200 
load_klass(Register dst,Register src)3201 void MacroAssembler::load_klass(Register dst, Register src) {
3202   if (UseCompressedClassPointers) {
3203     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3204     // Attention: no null check here!
3205     decode_klass_not_null(dst, dst);
3206   } else {
3207     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3208   }
3209 }
3210 
3211 // ((OopHandle)result).resolve();
resolve_oop_handle(Register result)3212 void MacroAssembler::resolve_oop_handle(Register result) {
3213   // OopHandle::resolve is an indirection.
3214   ld(result, 0, result);
3215 }
3216 
load_mirror_from_const_method(Register mirror,Register const_method)3217 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3218   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3219   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3220   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3221   resolve_oop_handle(mirror);
3222 }
3223 
load_method_holder(Register holder,Register method)3224 void MacroAssembler::load_method_holder(Register holder, Register method) {
3225   ld(holder, in_bytes(Method::const_offset()), method);
3226   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3227   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3228 }
3229 
3230 // Clear Array
3231 // For very short arrays. tmp == R0 is allowed.
clear_memory_unrolled(Register base_ptr,int cnt_dwords,Register tmp,int offset)3232 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3233   if (cnt_dwords > 0) { li(tmp, 0); }
3234   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3235 }
3236 
3237 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
clear_memory_constlen(Register base_ptr,int cnt_dwords,Register tmp)3238 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3239   if (cnt_dwords < 8) {
3240     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3241     return;
3242   }
3243 
3244   Label loop;
3245   const long loopcnt   = cnt_dwords >> 1,
3246              remainder = cnt_dwords & 1;
3247 
3248   li(tmp, loopcnt);
3249   mtctr(tmp);
3250   li(tmp, 0);
3251   bind(loop);
3252     std(tmp, 0, base_ptr);
3253     std(tmp, 8, base_ptr);
3254     addi(base_ptr, base_ptr, 16);
3255     bdnz(loop);
3256   if (remainder) { std(tmp, 0, base_ptr); }
3257 }
3258 
3259 // Kills both input registers. tmp == R0 is allowed.
clear_memory_doubleword(Register base_ptr,Register cnt_dwords,Register tmp,long const_cnt)3260 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3261   // Procedure for large arrays (uses data cache block zero instruction).
3262     Label startloop, fast, fastloop, small_rest, restloop, done;
3263     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3264               cl_dwords       = cl_size >> 3,
3265               cl_dw_addr_bits = exact_log2(cl_dwords),
3266               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3267               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3268 
3269   if (const_cnt >= 0) {
3270     // Constant case.
3271     if (const_cnt < min_cnt) {
3272       clear_memory_constlen(base_ptr, const_cnt, tmp);
3273       return;
3274     }
3275     load_const_optimized(cnt_dwords, const_cnt, tmp);
3276   } else {
3277     // cnt_dwords already loaded in register. Need to check size.
3278     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3279     blt(CCR1, small_rest);
3280   }
3281     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3282     beq(CCR0, fast);                                  // Already 128byte aligned.
3283 
3284     subfic(tmp, tmp, cl_dwords);
3285     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3286     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3287     li(tmp, 0);
3288 
3289   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3290     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3291     addi(base_ptr, base_ptr, 8);
3292     bdnz(startloop);
3293 
3294   bind(fast);                                  // Clear 128byte blocks.
3295     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3296     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3297     mtctr(tmp);                                // Load counter.
3298 
3299   bind(fastloop);
3300     dcbz(base_ptr);                    // Clear 128byte aligned block.
3301     addi(base_ptr, base_ptr, cl_size);
3302     bdnz(fastloop);
3303 
3304   bind(small_rest);
3305     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3306     beq(CCR0, done);                   // rest == 0
3307     li(tmp, 0);
3308     mtctr(cnt_dwords);                 // Load counter.
3309 
3310   bind(restloop);                      // Clear rest.
3311     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3312     addi(base_ptr, base_ptr, 8);
3313     bdnz(restloop);
3314 
3315   bind(done);
3316 }
3317 
3318 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3319 
3320 // Helpers for Intrinsic Emitters
3321 //
3322 // Revert the byte order of a 32bit value in a register
3323 //   src: 0x44556677
3324 //   dst: 0x77665544
3325 // Three steps to obtain the result:
3326 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3327 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3328 //     This value initializes dst.
3329 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3330 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3331 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3332 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3333 //     This value is mask inserted into dst with a [8..15] mask of 1s.
load_reverse_32(Register dst,Register src)3334 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3335   assert_different_registers(dst, src);
3336 
3337   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3338   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3339   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3340 }
3341 
3342 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3343 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3344 // body size from 20 to 16 instructions.
3345 // Returns the offset that was used to calculate the address of column tc3.
3346 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3347 // at hand, the original table address can be easily reconstructed.
crc32_table_columns(Register table,Register tc0,Register tc1,Register tc2,Register tc3)3348 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3349   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3350 
3351   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3352   // Layout: See StubRoutines::generate_crc_constants.
3353 #ifdef VM_LITTLE_ENDIAN
3354   const int ix0 = 3 * CRC32_TABLE_SIZE;
3355   const int ix1 = 2 * CRC32_TABLE_SIZE;
3356   const int ix2 = 1 * CRC32_TABLE_SIZE;
3357   const int ix3 = 0 * CRC32_TABLE_SIZE;
3358 #else
3359   const int ix0 = 1 * CRC32_TABLE_SIZE;
3360   const int ix1 = 2 * CRC32_TABLE_SIZE;
3361   const int ix2 = 3 * CRC32_TABLE_SIZE;
3362   const int ix3 = 4 * CRC32_TABLE_SIZE;
3363 #endif
3364   assert_different_registers(table, tc0, tc1, tc2);
3365   assert(table == tc3, "must be!");
3366 
3367   addi(tc0, table, ix0);
3368   addi(tc1, table, ix1);
3369   addi(tc2, table, ix2);
3370   if (ix3 != 0) addi(tc3, table, ix3);
3371 
3372   return ix3;
3373 }
3374 
3375 /**
3376  * uint32_t crc;
3377  * table[crc & 0xFF] ^ (crc >> 8);
3378  */
fold_byte_crc32(Register crc,Register val,Register table,Register tmp)3379 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3380   assert_different_registers(crc, table, tmp);
3381   assert_different_registers(val, table);
3382 
3383   if (crc == val) {                   // Must rotate first to use the unmodified value.
3384     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3385                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3386     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3387   } else {
3388     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3389     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3390   }
3391   lwzx(tmp, table, tmp);
3392   xorr(crc, crc, tmp);
3393 }
3394 
3395 /**
3396  * Emits code to update CRC-32 with a byte value according to constants in table.
3397  *
3398  * @param [in,out]crc   Register containing the crc.
3399  * @param [in]val       Register containing the byte to fold into the CRC.
3400  * @param [in]table     Register containing the table of crc constants.
3401  *
3402  * uint32_t crc;
3403  * val = crc_table[(val ^ crc) & 0xFF];
3404  * crc = val ^ (crc >> 8);
3405  */
update_byte_crc32(Register crc,Register val,Register table)3406 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3407   BLOCK_COMMENT("update_byte_crc32:");
3408   xorr(val, val, crc);
3409   fold_byte_crc32(crc, val, table, val);
3410 }
3411 
3412 /**
3413  * @param crc   register containing existing CRC (32-bit)
3414  * @param buf   register pointing to input byte buffer (byte*)
3415  * @param len   register containing number of bytes
3416  * @param table register pointing to CRC table
3417  */
update_byteLoop_crc32(Register crc,Register buf,Register len,Register table,Register data,bool loopAlignment)3418 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3419                                            Register data, bool loopAlignment) {
3420   assert_different_registers(crc, buf, len, table, data);
3421 
3422   Label L_mainLoop, L_done;
3423   const int mainLoop_stepping  = 1;
3424   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3425 
3426   // Process all bytes in a single-byte loop.
3427   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3428   beq(CCR0, L_done);
3429 
3430   mtctr(len);
3431   align(mainLoop_alignment);
3432   BIND(L_mainLoop);
3433     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3434     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3435     update_byte_crc32(crc, data, table);
3436     bdnz(L_mainLoop);                            // Iterate.
3437 
3438   bind(L_done);
3439 }
3440 
3441 /**
3442  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3443  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3444  */
3445 // A note on the lookup table address(es):
3446 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3447 // To save the effort of adding the column offset to the table address each time
3448 // a table element is looked up, it is possible to pass the pre-calculated
3449 // column addresses.
3450 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
update_1word_crc32(Register crc,Register buf,Register table,int bufDisp,int bufInc,Register t0,Register t1,Register t2,Register t3,Register tc0,Register tc1,Register tc2,Register tc3)3451 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3452                                         Register t0,  Register t1,  Register t2,  Register t3,
3453                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3454   assert_different_registers(crc, t3);
3455 
3456   // XOR crc with next four bytes of buffer.
3457   lwz(t3, bufDisp, buf);
3458   if (bufInc != 0) {
3459     addi(buf, buf, bufInc);
3460   }
3461   xorr(t3, t3, crc);
3462 
3463   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3464   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3465   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3466   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3467   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3468 
3469   // Use the pre-calculated column addresses.
3470   // Load pre-calculated table values.
3471   lwzx(t0, tc0, t0);
3472   lwzx(t1, tc1, t1);
3473   lwzx(t2, tc2, t2);
3474   lwzx(t3, tc3, t3);
3475 
3476   // Calculate new crc from table values.
3477   xorr(t0,  t0, t1);
3478   xorr(t2,  t2, t3);
3479   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3480 }
3481 
3482 /**
3483  * @param crc   register containing existing CRC (32-bit)
3484  * @param buf   register pointing to input byte buffer (byte*)
3485  * @param len   register containing number of bytes
3486  * @param table register pointing to CRC table
3487  *
3488  * uses R9..R12 as work register. Must be saved/restored by caller!
3489  */
kernel_crc32_1word(Register crc,Register buf,Register len,Register table,Register t0,Register t1,Register t2,Register t3,Register tc0,Register tc1,Register tc2,Register tc3,bool invertCRC)3490 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3491                                         Register t0,  Register t1,  Register t2,  Register t3,
3492                                         Register tc0, Register tc1, Register tc2, Register tc3,
3493                                         bool invertCRC) {
3494   assert_different_registers(crc, buf, len, table);
3495 
3496   Label L_mainLoop, L_tail;
3497   Register  tmp          = t0;
3498   Register  data         = t0;
3499   Register  tmp2         = t1;
3500   const int mainLoop_stepping  = 4;
3501   const int tailLoop_stepping  = 1;
3502   const int log_stepping       = exact_log2(mainLoop_stepping);
3503   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3504   const int complexThreshold   = 2*mainLoop_stepping;
3505 
3506   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3507   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3508   // for all well-behaved cases. The situation itself is detected and handled correctly
3509   // within update_byteLoop_crc32.
3510   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3511 
3512   BLOCK_COMMENT("kernel_crc32_1word {");
3513 
3514   if (invertCRC) {
3515     nand(crc, crc, crc);                      // 1s complement of crc
3516   }
3517 
3518   // Check for short (<mainLoop_stepping) buffer.
3519   cmpdi(CCR0, len, complexThreshold);
3520   blt(CCR0, L_tail);
3521 
3522   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3523   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3524   {
3525     // Align buf addr to mainLoop_stepping boundary.
3526     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3527     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3528 
3529     if (complexThreshold > mainLoop_stepping) {
3530       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3531     } else {
3532       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3533       cmpdi(CCR0, tmp, mainLoop_stepping);
3534       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3535       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3536     }
3537     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3538   }
3539 
3540   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3541   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3542   mtctr(tmp2);
3543 
3544 #ifdef VM_LITTLE_ENDIAN
3545   Register crc_rv = crc;
3546 #else
3547   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3548                                                  // Occupies tmp, but frees up crc.
3549   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3550   tmp = crc;
3551 #endif
3552 
3553   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3554 
3555   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3556   BIND(L_mainLoop);
3557     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3558     bdnz(L_mainLoop);
3559 
3560 #ifndef VM_LITTLE_ENDIAN
3561   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3562   tmp = crc_rv;                                  // Tmp uses it's original register again.
3563 #endif
3564 
3565   // Restore original table address for tailLoop.
3566   if (reconstructTableOffset != 0) {
3567     addi(table, table, -reconstructTableOffset);
3568   }
3569 
3570   // Process last few (<complexThreshold) bytes of buffer.
3571   BIND(L_tail);
3572   update_byteLoop_crc32(crc, buf, len, table, data, false);
3573 
3574   if (invertCRC) {
3575     nand(crc, crc, crc);                      // 1s complement of crc
3576   }
3577   BLOCK_COMMENT("} kernel_crc32_1word");
3578 }
3579 
3580 /**
3581  * @param crc             register containing existing CRC (32-bit)
3582  * @param buf             register pointing to input byte buffer (byte*)
3583  * @param len             register containing number of bytes
3584  * @param constants       register pointing to precomputed constants
3585  * @param t0-t6           temp registers
3586  */
kernel_crc32_vpmsum(Register crc,Register buf,Register len,Register constants,Register t0,Register t1,Register t2,Register t3,Register t4,Register t5,Register t6,bool invertCRC)3587 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3588                                          Register t0, Register t1, Register t2, Register t3,
3589                                          Register t4, Register t5, Register t6, bool invertCRC) {
3590   assert_different_registers(crc, buf, len, constants);
3591 
3592   Label L_tail;
3593 
3594   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3595 
3596   if (invertCRC) {
3597     nand(crc, crc, crc);                      // 1s complement of crc
3598   }
3599 
3600   // Enforce 32 bit.
3601   clrldi(len, len, 32);
3602 
3603   // Align if we have enough bytes for the fast version.
3604   const int alignment = 16,
3605             threshold = 32;
3606   Register prealign = t0;
3607 
3608   neg(prealign, buf);
3609   addi(t1, len, -threshold);
3610   andi(prealign, prealign, alignment - 1);
3611   cmpw(CCR0, t1, prealign);
3612   blt(CCR0, L_tail); // len - prealign < threshold?
3613 
3614   subf(len, prealign, len);
3615   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3616 
3617   // Calculate from first aligned address as far as possible.
3618   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3619   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3620   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3621 
3622   // Remaining bytes.
3623   BIND(L_tail);
3624   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3625 
3626   if (invertCRC) {
3627     nand(crc, crc, crc);                      // 1s complement of crc
3628   }
3629 
3630   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3631 }
3632 
3633 /**
3634  * @param crc             register containing existing CRC (32-bit)
3635  * @param buf             register pointing to input byte buffer (byte*)
3636  * @param len             register containing number of bytes (will get updated to remaining bytes)
3637  * @param constants       register pointing to CRC table for 128-bit aligned memory
3638  * @param t0-t6           temp registers
3639  */
kernel_crc32_vpmsum_aligned(Register crc,Register buf,Register len,Register constants,Register t0,Register t1,Register t2,Register t3,Register t4,Register t5,Register t6)3640 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3641     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3642 
3643   // Save non-volatile vector registers (frameless).
3644   Register offset = t1;
3645   int offsetInt = 0;
3646   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3647   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3648   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3649   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3650   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3651   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3652 #ifndef VM_LITTLE_ENDIAN
3653   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3654 #endif
3655   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3656   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3657 
3658   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3659   // bytes per iteration. The basic scheme is:
3660   // lvx: load vector (Big Endian needs reversal)
3661   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3662   // vxor: xor partial results together to get unroll_factor2 vectors
3663 
3664   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3665 
3666   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3667   const int unroll_factor = CRC32_UNROLL_FACTOR,
3668             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3669 
3670   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3671             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3672 
3673   // Support registers.
3674   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3675   Register num_bytes = R14,
3676            loop_count = R15,
3677            cur_const = crc; // will live in VCRC
3678   // Constant array for outer loop: unroll_factor2 - 1 registers,
3679   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3680   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3681                  consts1[] = { VR23, VR24 };
3682   // Data register arrays: 2 arrays with unroll_factor2 registers.
3683   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3684                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3685 
3686   VectorRegister VCRC = data0[0];
3687   VectorRegister Vc = VR25;
3688   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3689 
3690   // We have at least 1 iteration (ensured by caller).
3691   Label L_outer_loop, L_inner_loop, L_last;
3692 
3693   // If supported set DSCR pre-fetch to deepest.
3694   if (VM_Version::has_mfdscr()) {
3695     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3696     mtdscr(t0);
3697   }
3698 
3699   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3700 
3701   for (int i = 1; i < unroll_factor2; ++i) {
3702     li(offs[i], 16 * i);
3703   }
3704 
3705   // Load consts for outer loop
3706   lvx(consts0[0], constants);
3707   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3708     lvx(consts0[i], offs[i], constants);
3709   }
3710 
3711   load_const_optimized(num_bytes, 16 * unroll_factor);
3712 
3713   // Reuse data registers outside of the loop.
3714   VectorRegister Vtmp = data1[0];
3715   VectorRegister Vtmp2 = data1[1];
3716   VectorRegister zeroes = data1[2];
3717 
3718   vspltisb(Vtmp, 0);
3719   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3720 
3721   // Load vector for vpermxor (to xor both 64 bit parts together)
3722   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3723   vspltisb(Vc, 4);
3724   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3725   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3726   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3727 
3728 #ifdef VM_LITTLE_ENDIAN
3729 #define BE_swap_bytes(x)
3730 #else
3731   vspltisb(Vtmp2, 0xf);
3732   vxor(swap_bytes, Vtmp, Vtmp2);
3733 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3734 #endif
3735 
3736   cmpd(CCR0, len, num_bytes);
3737   blt(CCR0, L_last);
3738 
3739   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3740   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3741 
3742   // ********** Main loop start **********
3743   align(32);
3744   bind(L_outer_loop);
3745 
3746   // Begin of unrolled first iteration (no xor).
3747   lvx(data1[0], buf);
3748   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3749     lvx(data1[i], offs[i], buf);
3750   }
3751   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3752   lvx(consts1[0], cur_const);
3753   mtctr(loop_count);
3754   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3755     BE_swap_bytes(data1[i]);
3756     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3757     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3758     vpmsumw(data0[i], data1[i], consts1[0]);
3759   }
3760   addi(buf, buf, 16 * unroll_factor2);
3761   subf(len, num_bytes, len);
3762   lvx(consts1[1], offs[1], cur_const);
3763   addi(cur_const, cur_const, 32);
3764   // Begin of unrolled second iteration (head).
3765   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3766     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3767     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3768     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3769   }
3770   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3771     BE_swap_bytes(data1[i]);
3772     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3773     vpmsumw(data1[i], data1[i], consts1[1]);
3774   }
3775   addi(buf, buf, 16 * unroll_factor2);
3776 
3777   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3778   // Double-iteration allows using the 2 constant registers alternatingly.
3779   align(32);
3780   bind(L_inner_loop);
3781   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3782     if (j & 1) {
3783       lvx(consts1[0], cur_const);
3784     } else {
3785       lvx(consts1[1], offs[1], cur_const);
3786       addi(cur_const, cur_const, 32);
3787     }
3788     for (int i = 0; i < unroll_factor2; ++i) {
3789       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3790       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3791       BE_swap_bytes(data1[idx]);
3792       vxor(data0[i], data0[i], data1[i]);
3793       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3794       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3795     }
3796     addi(buf, buf, 16 * unroll_factor2);
3797   }
3798   bdnz(L_inner_loop);
3799 
3800   addi(cur_const, constants, outer_consts_size); // Reset
3801 
3802   // Tail of last iteration (no loads).
3803   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3804     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3805     vxor(data0[i], data0[i], data1[i]);
3806     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3807   }
3808   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3809     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3810     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3811   }
3812 
3813   // Last data register is ok, other ones need fixup shift.
3814   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3815     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3816   }
3817 
3818   // Combine to 128 bit result vector VCRC = data0[0].
3819   for (int i = 1; i < unroll_factor2; i<<=1) {
3820     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3821       vxor(data0[j], data0[j], data0[j+i]);
3822     }
3823   }
3824   cmpd(CCR0, len, num_bytes);
3825   bge(CCR0, L_outer_loop);
3826 
3827   // Last chance with lower num_bytes.
3828   bind(L_last);
3829   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3830   // Point behind last const for inner loop.
3831   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3832   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3833   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3834   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3835 
3836   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3837   bgt(CCR0, L_outer_loop);
3838   // ********** Main loop end **********
3839 
3840   // Restore DSCR pre-fetch value.
3841   if (VM_Version::has_mfdscr()) {
3842     load_const_optimized(t0, VM_Version::_dscr_val);
3843     mtdscr(t0);
3844   }
3845 
3846   // ********** Simple loop for remaining 16 byte blocks **********
3847   {
3848     Label L_loop, L_done;
3849 
3850     srdi_(t0, len, 4); // 16 bytes per iteration
3851     clrldi(len, len, 64-4);
3852     beq(CCR0, L_done);
3853 
3854     // Point to const (same as last const for inner loop).
3855     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3856     mtctr(t0);
3857     lvx(Vtmp2, cur_const);
3858 
3859     align(32);
3860     bind(L_loop);
3861 
3862     lvx(Vtmp, buf);
3863     addi(buf, buf, 16);
3864     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3865     BE_swap_bytes(Vtmp);
3866     vxor(VCRC, VCRC, Vtmp);
3867     vpmsumw(VCRC, VCRC, Vtmp2);
3868     bdnz(L_loop);
3869 
3870     bind(L_done);
3871   }
3872   // ********** Simple loop end **********
3873 #undef BE_swap_bytes
3874 
3875   // Point to Barrett constants
3876   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3877 
3878   vspltisb(zeroes, 0);
3879 
3880   // Combine to 64 bit result.
3881   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3882 
3883   // Reduce to 32 bit CRC: Remainder by multiply-high.
3884   lvx(Vtmp, cur_const);
3885   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3886   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3887   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3888   vsldoi(Vtmp, zeroes, Vtmp, 8);
3889   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3890   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3891 
3892   // Move result. len is already updated.
3893   vsldoi(VCRC, VCRC, zeroes, 8);
3894   mfvrd(crc, VCRC);
3895 
3896   // Restore non-volatile Vector registers (frameless).
3897   offsetInt = 0;
3898   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3899   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3900   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3901   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3902   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3903   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3904 #ifndef VM_LITTLE_ENDIAN
3905   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3906 #endif
3907   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3908   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3909 }
3910 
crc32(Register crc,Register buf,Register len,Register t0,Register t1,Register t2,Register t3,Register t4,Register t5,Register t6,Register t7,bool is_crc32c)3911 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3912                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3913   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3914                                      : StubRoutines::crc_table_addr()   , R0);
3915 
3916   if (VM_Version::has_vpmsumb()) {
3917     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3918   } else {
3919     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3920   }
3921 }
3922 
kernel_crc32_singleByteReg(Register crc,Register val,Register table,bool invertCRC)3923 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3924   assert_different_registers(crc, val, table);
3925 
3926   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3927   if (invertCRC) {
3928     nand(crc, crc, crc);                // 1s complement of crc
3929   }
3930 
3931   update_byte_crc32(crc, val, table);
3932 
3933   if (invertCRC) {
3934     nand(crc, crc, crc);                // 1s complement of crc
3935   }
3936 }
3937 
3938 // dest_lo += src1 + src2
3939 // dest_hi += carry1 + carry2
add2_with_carry(Register dest_hi,Register dest_lo,Register src1,Register src2)3940 void MacroAssembler::add2_with_carry(Register dest_hi,
3941                                      Register dest_lo,
3942                                      Register src1, Register src2) {
3943   li(R0, 0);
3944   addc(dest_lo, dest_lo, src1);
3945   adde(dest_hi, dest_hi, R0);
3946   addc(dest_lo, dest_lo, src2);
3947   adde(dest_hi, dest_hi, R0);
3948 }
3949 
3950 // Multiply 64 bit by 64 bit first loop.
multiply_64_x_64_loop(Register x,Register xstart,Register x_xstart,Register y,Register y_idx,Register z,Register carry,Register product_high,Register product,Register idx,Register kdx,Register tmp)3951 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3952                                            Register x_xstart,
3953                                            Register y, Register y_idx,
3954                                            Register z,
3955                                            Register carry,
3956                                            Register product_high, Register product,
3957                                            Register idx, Register kdx,
3958                                            Register tmp) {
3959   //  jlong carry, x[], y[], z[];
3960   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3961   //    huge_128 product = y[idx] * x[xstart] + carry;
3962   //    z[kdx] = (jlong)product;
3963   //    carry  = (jlong)(product >>> 64);
3964   //  }
3965   //  z[xstart] = carry;
3966 
3967   Label L_first_loop, L_first_loop_exit;
3968   Label L_one_x, L_one_y, L_multiply;
3969 
3970   addic_(xstart, xstart, -1);
3971   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3972 
3973   // Load next two integers of x.
3974   sldi(tmp, xstart, LogBytesPerInt);
3975   ldx(x_xstart, x, tmp);
3976 #ifdef VM_LITTLE_ENDIAN
3977   rldicl(x_xstart, x_xstart, 32, 0);
3978 #endif
3979 
3980   align(32, 16);
3981   bind(L_first_loop);
3982 
3983   cmpdi(CCR0, idx, 1);
3984   blt(CCR0, L_first_loop_exit);
3985   addi(idx, idx, -2);
3986   beq(CCR0, L_one_y);
3987 
3988   // Load next two integers of y.
3989   sldi(tmp, idx, LogBytesPerInt);
3990   ldx(y_idx, y, tmp);
3991 #ifdef VM_LITTLE_ENDIAN
3992   rldicl(y_idx, y_idx, 32, 0);
3993 #endif
3994 
3995 
3996   bind(L_multiply);
3997   multiply64(product_high, product, x_xstart, y_idx);
3998 
3999   li(tmp, 0);
4000   addc(product, product, carry);         // Add carry to result.
4001   adde(product_high, product_high, tmp); // Add carry of the last addition.
4002   addi(kdx, kdx, -2);
4003 
4004   // Store result.
4005 #ifdef VM_LITTLE_ENDIAN
4006   rldicl(product, product, 32, 0);
4007 #endif
4008   sldi(tmp, kdx, LogBytesPerInt);
4009   stdx(product, z, tmp);
4010   mr_if_needed(carry, product_high);
4011   b(L_first_loop);
4012 
4013 
4014   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4015 
4016   lwz(y_idx, 0, y);
4017   b(L_multiply);
4018 
4019 
4020   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4021 
4022   lwz(x_xstart, 0, x);
4023   b(L_first_loop);
4024 
4025   bind(L_first_loop_exit);
4026 }
4027 
4028 // Multiply 64 bit by 64 bit and add 128 bit.
multiply_add_128_x_128(Register x_xstart,Register y,Register z,Register yz_idx,Register idx,Register carry,Register product_high,Register product,Register tmp,int offset)4029 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4030                                             Register z, Register yz_idx,
4031                                             Register idx, Register carry,
4032                                             Register product_high, Register product,
4033                                             Register tmp, int offset) {
4034 
4035   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4036   //  z[kdx] = (jlong)product;
4037 
4038   sldi(tmp, idx, LogBytesPerInt);
4039   if (offset) {
4040     addi(tmp, tmp, offset);
4041   }
4042   ldx(yz_idx, y, tmp);
4043 #ifdef VM_LITTLE_ENDIAN
4044   rldicl(yz_idx, yz_idx, 32, 0);
4045 #endif
4046 
4047   multiply64(product_high, product, x_xstart, yz_idx);
4048   ldx(yz_idx, z, tmp);
4049 #ifdef VM_LITTLE_ENDIAN
4050   rldicl(yz_idx, yz_idx, 32, 0);
4051 #endif
4052 
4053   add2_with_carry(product_high, product, carry, yz_idx);
4054 
4055   sldi(tmp, idx, LogBytesPerInt);
4056   if (offset) {
4057     addi(tmp, tmp, offset);
4058   }
4059 #ifdef VM_LITTLE_ENDIAN
4060   rldicl(product, product, 32, 0);
4061 #endif
4062   stdx(product, z, tmp);
4063 }
4064 
4065 // Multiply 128 bit by 128 bit. Unrolled inner loop.
multiply_128_x_128_loop(Register x_xstart,Register y,Register z,Register yz_idx,Register idx,Register carry,Register product_high,Register product,Register carry2,Register tmp)4066 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4067                                              Register y, Register z,
4068                                              Register yz_idx, Register idx, Register carry,
4069                                              Register product_high, Register product,
4070                                              Register carry2, Register tmp) {
4071 
4072   //  jlong carry, x[], y[], z[];
4073   //  int kdx = ystart+1;
4074   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4075   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4076   //    z[kdx+idx+1] = (jlong)product;
4077   //    jlong carry2 = (jlong)(product >>> 64);
4078   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4079   //    z[kdx+idx] = (jlong)product;
4080   //    carry = (jlong)(product >>> 64);
4081   //  }
4082   //  idx += 2;
4083   //  if (idx > 0) {
4084   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4085   //    z[kdx+idx] = (jlong)product;
4086   //    carry = (jlong)(product >>> 64);
4087   //  }
4088 
4089   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4090   const Register jdx = R0;
4091 
4092   // Scale the index.
4093   srdi_(jdx, idx, 2);
4094   beq(CCR0, L_third_loop_exit);
4095   mtctr(jdx);
4096 
4097   align(32, 16);
4098   bind(L_third_loop);
4099 
4100   addi(idx, idx, -4);
4101 
4102   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4103   mr_if_needed(carry2, product_high);
4104 
4105   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4106   mr_if_needed(carry, product_high);
4107   bdnz(L_third_loop);
4108 
4109   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4110 
4111   andi_(idx, idx, 0x3);
4112   beq(CCR0, L_post_third_loop_done);
4113 
4114   Label L_check_1;
4115 
4116   addic_(idx, idx, -2);
4117   blt(CCR0, L_check_1);
4118 
4119   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4120   mr_if_needed(carry, product_high);
4121 
4122   bind(L_check_1);
4123 
4124   addi(idx, idx, 0x2);
4125   andi_(idx, idx, 0x1);
4126   addic_(idx, idx, -1);
4127   blt(CCR0, L_post_third_loop_done);
4128 
4129   sldi(tmp, idx, LogBytesPerInt);
4130   lwzx(yz_idx, y, tmp);
4131   multiply64(product_high, product, x_xstart, yz_idx);
4132   lwzx(yz_idx, z, tmp);
4133 
4134   add2_with_carry(product_high, product, yz_idx, carry);
4135 
4136   sldi(tmp, idx, LogBytesPerInt);
4137   stwx(product, z, tmp);
4138   srdi(product, product, 32);
4139 
4140   sldi(product_high, product_high, 32);
4141   orr(product, product, product_high);
4142   mr_if_needed(carry, product);
4143 
4144   bind(L_post_third_loop_done);
4145 }   // multiply_128_x_128_loop
4146 
muladd(Register out,Register in,Register offset,Register len,Register k,Register tmp1,Register tmp2,Register carry)4147 void MacroAssembler::muladd(Register out, Register in,
4148                             Register offset, Register len, Register k,
4149                             Register tmp1, Register tmp2, Register carry) {
4150 
4151   // Labels
4152   Label LOOP, SKIP;
4153 
4154   // Make sure length is positive.
4155   cmpdi  (CCR0,    len,     0);
4156 
4157   // Prepare variables
4158   subi   (offset,  offset,  4);
4159   li     (carry,   0);
4160   ble    (CCR0,    SKIP);
4161 
4162   mtctr  (len);
4163   subi   (len,     len,     1    );
4164   sldi   (len,     len,     2    );
4165 
4166   // Main loop
4167   bind(LOOP);
4168   lwzx   (tmp1,    len,     in   );
4169   lwzx   (tmp2,    offset,  out  );
4170   mulld  (tmp1,    tmp1,    k    );
4171   add    (tmp2,    carry,   tmp2 );
4172   add    (tmp2,    tmp1,    tmp2 );
4173   stwx   (tmp2,    offset,  out  );
4174   srdi   (carry,   tmp2,    32   );
4175   subi   (offset,  offset,  4    );
4176   subi   (len,     len,     4    );
4177   bdnz   (LOOP);
4178   bind(SKIP);
4179 }
4180 
multiply_to_len(Register x,Register xlen,Register y,Register ylen,Register z,Register zlen,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register tmp6,Register tmp7,Register tmp8,Register tmp9,Register tmp10,Register tmp11,Register tmp12,Register tmp13)4181 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4182                                      Register y, Register ylen,
4183                                      Register z, Register zlen,
4184                                      Register tmp1, Register tmp2,
4185                                      Register tmp3, Register tmp4,
4186                                      Register tmp5, Register tmp6,
4187                                      Register tmp7, Register tmp8,
4188                                      Register tmp9, Register tmp10,
4189                                      Register tmp11, Register tmp12,
4190                                      Register tmp13) {
4191 
4192   ShortBranchVerifier sbv(this);
4193 
4194   assert_different_registers(x, xlen, y, ylen, z, zlen,
4195                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4196   assert_different_registers(x, xlen, y, ylen, z, zlen,
4197                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4198   assert_different_registers(x, xlen, y, ylen, z, zlen,
4199                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4200 
4201   const Register idx = tmp1;
4202   const Register kdx = tmp2;
4203   const Register xstart = tmp3;
4204 
4205   const Register y_idx = tmp4;
4206   const Register carry = tmp5;
4207   const Register product = tmp6;
4208   const Register product_high = tmp7;
4209   const Register x_xstart = tmp8;
4210   const Register tmp = tmp9;
4211 
4212   // First Loop.
4213   //
4214   //  final static long LONG_MASK = 0xffffffffL;
4215   //  int xstart = xlen - 1;
4216   //  int ystart = ylen - 1;
4217   //  long carry = 0;
4218   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
4219   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4220   //    z[kdx] = (int)product;
4221   //    carry = product >>> 32;
4222   //  }
4223   //  z[xstart] = (int)carry;
4224 
4225   mr_if_needed(idx, ylen);        // idx = ylen
4226   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4227   li(carry, 0);                   // carry = 0
4228 
4229   Label L_done;
4230 
4231   addic_(xstart, xlen, -1);
4232   blt(CCR0, L_done);
4233 
4234   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4235                         carry, product_high, product, idx, kdx, tmp);
4236 
4237   Label L_second_loop;
4238 
4239   cmpdi(CCR0, kdx, 0);
4240   beq(CCR0, L_second_loop);
4241 
4242   Label L_carry;
4243 
4244   addic_(kdx, kdx, -1);
4245   beq(CCR0, L_carry);
4246 
4247   // Store lower 32 bits of carry.
4248   sldi(tmp, kdx, LogBytesPerInt);
4249   stwx(carry, z, tmp);
4250   srdi(carry, carry, 32);
4251   addi(kdx, kdx, -1);
4252 
4253 
4254   bind(L_carry);
4255 
4256   // Store upper 32 bits of carry.
4257   sldi(tmp, kdx, LogBytesPerInt);
4258   stwx(carry, z, tmp);
4259 
4260   // Second and third (nested) loops.
4261   //
4262   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4263   //    carry = 0;
4264   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4265   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4266   //                     (z[k] & LONG_MASK) + carry;
4267   //      z[k] = (int)product;
4268   //      carry = product >>> 32;
4269   //    }
4270   //    z[i] = (int)carry;
4271   //  }
4272   //
4273   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4274 
4275   bind(L_second_loop);
4276 
4277   li(carry, 0);                   // carry = 0;
4278 
4279   addic_(xstart, xstart, -1);     // i = xstart-1;
4280   blt(CCR0, L_done);
4281 
4282   Register zsave = tmp10;
4283 
4284   mr(zsave, z);
4285 
4286 
4287   Label L_last_x;
4288 
4289   sldi(tmp, xstart, LogBytesPerInt);
4290   add(z, z, tmp);                 // z = z + k - j
4291   addi(z, z, 4);
4292   addic_(xstart, xstart, -1);     // i = xstart-1;
4293   blt(CCR0, L_last_x);
4294 
4295   sldi(tmp, xstart, LogBytesPerInt);
4296   ldx(x_xstart, x, tmp);
4297 #ifdef VM_LITTLE_ENDIAN
4298   rldicl(x_xstart, x_xstart, 32, 0);
4299 #endif
4300 
4301 
4302   Label L_third_loop_prologue;
4303 
4304   bind(L_third_loop_prologue);
4305 
4306   Register xsave = tmp11;
4307   Register xlensave = tmp12;
4308   Register ylensave = tmp13;
4309 
4310   mr(xsave, x);
4311   mr(xlensave, xstart);
4312   mr(ylensave, ylen);
4313 
4314 
4315   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4316                           carry, product_high, product, x, tmp);
4317 
4318   mr(z, zsave);
4319   mr(x, xsave);
4320   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4321   mr(ylen, ylensave);
4322 
4323   addi(tmp3, xlen, 1);
4324   sldi(tmp, tmp3, LogBytesPerInt);
4325   stwx(carry, z, tmp);
4326   addic_(tmp3, tmp3, -1);
4327   blt(CCR0, L_done);
4328 
4329   srdi(carry, carry, 32);
4330   sldi(tmp, tmp3, LogBytesPerInt);
4331   stwx(carry, z, tmp);
4332   b(L_second_loop);
4333 
4334   // Next infrequent code is moved outside loops.
4335   bind(L_last_x);
4336 
4337   lwz(x_xstart, 0, x);
4338   b(L_third_loop_prologue);
4339 
4340   bind(L_done);
4341 }   // multiply_to_len
4342 
asm_assert(bool check_equal,const char * msg)4343 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4344 #ifdef ASSERT
4345   Label ok;
4346   if (check_equal) {
4347     beq(CCR0, ok);
4348   } else {
4349     bne(CCR0, ok);
4350   }
4351   stop(msg);
4352   bind(ok);
4353 #endif
4354 }
4355 
asm_assert_mems_zero(bool check_equal,int size,int mem_offset,Register mem_base,const char * msg)4356 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4357                                           Register mem_base, const char* msg) {
4358 #ifdef ASSERT
4359   switch (size) {
4360     case 4:
4361       lwz(R0, mem_offset, mem_base);
4362       cmpwi(CCR0, R0, 0);
4363       break;
4364     case 8:
4365       ld(R0, mem_offset, mem_base);
4366       cmpdi(CCR0, R0, 0);
4367       break;
4368     default:
4369       ShouldNotReachHere();
4370   }
4371   asm_assert(check_equal, msg);
4372 #endif // ASSERT
4373 }
4374 
verify_thread()4375 void MacroAssembler::verify_thread() {
4376   if (VerifyThread) {
4377     unimplemented("'VerifyThread' currently not implemented on PPC");
4378   }
4379 }
4380 
verify_coop(Register coop,const char * msg)4381 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4382   if (!VerifyOops) { return; }
4383   if (UseCompressedOops) { decode_heap_oop(coop); }
4384   verify_oop(coop, msg);
4385   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4386 }
4387 
4388 // READ: oop. KILL: R0. Volatile floats perhaps.
verify_oop(Register oop,const char * msg)4389 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4390   if (!VerifyOops) {
4391     return;
4392   }
4393 
4394   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4395   const Register tmp = R11; // Will be preserved.
4396   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4397 
4398   BLOCK_COMMENT("verify_oop {");
4399 
4400   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4401 
4402   mr_if_needed(R4_ARG2, oop);
4403   save_LR_CR(tmp); // save in old frame
4404   push_frame_reg_args(nbytes_save, tmp);
4405   // load FunctionDescriptor** / entry_address *
4406   load_const_optimized(tmp, fd, R0);
4407   // load FunctionDescriptor* / entry_address
4408   ld(tmp, 0, tmp);
4409   load_const_optimized(R3_ARG1, (address)msg, R0);
4410   // Call destination for its side effect.
4411   call_c(tmp);
4412 
4413   pop_frame();
4414   restore_LR_CR(tmp);
4415   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4416 
4417   BLOCK_COMMENT("} verify_oop");
4418 }
4419 
verify_oop_addr(RegisterOrConstant offs,Register base,const char * msg)4420 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4421   if (!VerifyOops) {
4422     return;
4423   }
4424 
4425   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4426   const Register tmp = R11; // Will be preserved.
4427   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4428   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4429 
4430   ld(R4_ARG2, offs, base);
4431   save_LR_CR(tmp); // save in old frame
4432   push_frame_reg_args(nbytes_save, tmp);
4433   // load FunctionDescriptor** / entry_address *
4434   load_const_optimized(tmp, fd, R0);
4435   // load FunctionDescriptor* / entry_address
4436   ld(tmp, 0, tmp);
4437   load_const_optimized(R3_ARG1, (address)msg, R0);
4438   // Call destination for its side effect.
4439   call_c(tmp);
4440 
4441   pop_frame();
4442   restore_LR_CR(tmp);
4443   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4444 }
4445 
4446 // Call a C-function that prints output.
stop(int type,const char * msg)4447 void MacroAssembler::stop(int type, const char* msg) {
4448   bool msg_present = (msg != NULL);
4449 
4450 #ifndef PRODUCT
4451   block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4452 #else
4453   block_comment("stop {");
4454 #endif
4455 
4456   if (msg_present) {
4457     type |= stop_msg_present;
4458   }
4459   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4460   if (msg_present) {
4461     emit_int64((uintptr_t)msg);
4462   }
4463 
4464   block_comment("} stop;");
4465 }
4466 
4467 #ifndef PRODUCT
4468 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4469 // Val, addr are temp registers.
4470 // If low == addr, addr is killed.
4471 // High is preserved.
zap_from_to(Register low,int before,Register high,int after,Register val,Register addr)4472 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4473   if (!ZapMemory) return;
4474 
4475   assert_different_registers(low, val);
4476 
4477   BLOCK_COMMENT("zap memory region {");
4478   load_const_optimized(val, 0x0101010101010101);
4479   int size = before + after;
4480   if (low == high && size < 5 && size > 0) {
4481     int offset = -before*BytesPerWord;
4482     for (int i = 0; i < size; ++i) {
4483       std(val, offset, low);
4484       offset += (1*BytesPerWord);
4485     }
4486   } else {
4487     addi(addr, low, -before*BytesPerWord);
4488     assert_different_registers(high, val);
4489     if (after) addi(high, high, after * BytesPerWord);
4490     Label loop;
4491     bind(loop);
4492     std(val, 0, addr);
4493     addi(addr, addr, 8);
4494     cmpd(CCR6, addr, high);
4495     ble(CCR6, loop);
4496     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4497   }
4498   BLOCK_COMMENT("} zap memory region");
4499 }
4500 
4501 #endif // !PRODUCT
4502 
skip_to_label_if_equal_zero(MacroAssembler * masm,Register temp,const bool * flag_addr,Label & label)4503 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4504                                                   const bool* flag_addr, Label& label) {
4505   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4506   assert(sizeof(bool) == 1, "PowerPC ABI");
4507   masm->lbz(temp, simm16_offset, temp);
4508   masm->cmpwi(CCR0, temp, 0);
4509   masm->beq(CCR0, label);
4510 }
4511 
SkipIfEqualZero(MacroAssembler * masm,Register temp,const bool * flag_addr)4512 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4513   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4514 }
4515 
~SkipIfEqualZero()4516 SkipIfEqualZero::~SkipIfEqualZero() {
4517   _masm->bind(_label);
4518 }
4519 
cache_wb(Address line)4520 void MacroAssembler::cache_wb(Address line) {
4521   assert(line.index() == noreg, "index should be noreg");
4522   assert(line.disp() == 0, "displacement should be 0");
4523   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4524   // Data Cache Store, not really a flush, so it works like a sync of cache
4525   // line and persistent mem, i.e. copying the cache line to persistent whilst
4526   // not invalidating the cache line.
4527   dcbst(line.base());
4528 }
4529 
cache_wbsync(bool is_presync)4530 void MacroAssembler::cache_wbsync(bool is_presync) {
4531   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4532   // We only need a post sync barrier. Post means _after_ a cache line flush or
4533   // store instruction, pre means a barrier emitted before such a instructions.
4534   if (!is_presync) {
4535     fence();
4536   }
4537 }
4538